amdfam10
From-SVN: r121625
This commit is contained in:
parent
63694bdd4e
commit
21efb4d464
154
gcc/ChangeLog
154
gcc/ChangeLog
@ -1,3 +1,157 @@
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8,
|
||||
athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov,
|
||||
athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul,
|
||||
athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn,
|
||||
athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8,
|
||||
athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load,
|
||||
athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8,
|
||||
athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse,
|
||||
cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387,
|
||||
swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse,
|
||||
fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse,
|
||||
x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed,
|
||||
floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse,
|
||||
floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1,
|
||||
mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn,
|
||||
umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn,
|
||||
umuldi3_highpart_rex64, umulsi3_highpart_insn,
|
||||
umulsi3_highpart_zext, smuldi3_highpart_rex64,
|
||||
smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld,
|
||||
x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse,
|
||||
sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387,
|
||||
sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387,
|
||||
sqrtextenddfxf2_i387): Added amdfam10_decode.
|
||||
|
||||
* config/i386/athlon.md (athlon_idirect_amdfam10,
|
||||
athlon_ivector_amdfam10, athlon_idirect_load_amdfam10,
|
||||
athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10,
|
||||
athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10,
|
||||
athlon_ivector_store_amdfam10): New define_insn_reservation.
|
||||
(athlon_idirect_loadmov, athlon_idirect_movstore): Added
|
||||
amdfam10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (athlon_call_amdfam10,
|
||||
athlon_pop_amdfam10, athlon_lea_amdfam10): New
|
||||
define_insn_reservation.
|
||||
(athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8,
|
||||
athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI,
|
||||
athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (athlon_sseld_amdfam10,
|
||||
athlon_mmxld_amdfam10, athlon_ssest_amdfam10,
|
||||
athlon_mmxssest_short_amdfam10): New define_insn_reservation.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (athlon_sseins_amdfam10): New
|
||||
define_insn_reservation.
|
||||
* config/i386/i386.md (sseins): Added sseins to define_attr type
|
||||
and define_attr unit.
|
||||
* config/i386/sse.md: Set type attribute to sseins for insertq
|
||||
and insertqi.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10,
|
||||
ssecmpvector_load_amdfam10, ssecmpvector_amdfam10,
|
||||
ssecomi_load_amdfam10, ssecomi_amdfam10,
|
||||
sseaddvector_load_amdfam10, sseaddvector_amdfam10): New
|
||||
define_insn_reservation.
|
||||
(ssecmp_load_k8, ssecmp, sseadd_load_k8, sseadd): Added amdfam10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (cvtss2sd_load_amdfam10,
|
||||
cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10,
|
||||
cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10,
|
||||
cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10,
|
||||
cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10,
|
||||
cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New
|
||||
define_insn_reservation.
|
||||
|
||||
* config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si,
|
||||
cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq,
|
||||
cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq,
|
||||
cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd,
|
||||
cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/athlon.md (athlon_ssedivvector_amdfam10,
|
||||
athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10,
|
||||
athlon_ssemulvector_load_amdfam10): New define_insn_reservation.
|
||||
(athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul,
|
||||
athlon_ssemul_load_k8): Added amdfam10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro.
|
||||
(x86_sse_unaligned_move_optimal): New variable.
|
||||
|
||||
* config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for
|
||||
m_AMDFAM10.
|
||||
(ix86_expand_vector_move_misalign): Add code to generate movupd/movups
|
||||
for unaligned vector SSE double/single precision loads for AMDFAM10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/i386.h (TARGET_AMDFAM10): New macro.
|
||||
(TARGET_CPU_CPP_BUILTINS): Add code for amdfam10.
|
||||
Define TARGET_CPU_DEFAULT_amdfam10.
|
||||
(TARGET_CPU_DEFAULT_NAMES): Add amdfam10.
|
||||
(processor_type): Add PROCESSOR_AMDFAM10.
|
||||
|
||||
* config/i386/i386.md: Add amdfam10 as a new cpu attribute to match
|
||||
processor_type in config/i386/i386.h.
|
||||
Enable imul peepholes for TARGET_AMDFAM10.
|
||||
|
||||
* config.gcc: Add support for --with-cpu option for amdfam10.
|
||||
|
||||
* config/i386/i386.c (amdfam10_cost): New variable.
|
||||
(m_AMDFAM10): New macro.
|
||||
(m_ATHLON_K8_AMDFAM10): New macro.
|
||||
(x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen,
|
||||
x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop,
|
||||
x86_promote_QImode, x86_integer_DFmode_moves,
|
||||
x86_partial_reg_dependency, x86_memory_mismatch_stall,
|
||||
x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
|
||||
x86_sse_partial_reg_dependency, x86_sse_typeless_stores,
|
||||
x86_use_ffreep, x86_use_incdec, x86_four_jump_limit,
|
||||
x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns):
|
||||
Enable/disable for amdfam10.
|
||||
(override_options): Add amdfam10_cost to processor_target_table.
|
||||
Set up PROCESSOR_AMDFAM10 for amdfam10 entry in
|
||||
processor_alias_table.
|
||||
(ix86_issue_rate): Add PROCESSOR_AMDFAM10.
|
||||
(ix86_adjust_cost): Add code for amdfam10.
|
||||
|
||||
2007-02-05 Harsha Jagasia <harsha.jagasia@amd.com>
|
||||
|
||||
* config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm)
|
||||
instruction set feature flag. Add new (-mpopcnt) flag for popcnt
|
||||
instruction. Add new SSE4A (-msse4a) instruction set feature flag.
|
||||
* config/i386/i386.h: Add builtin definition for SSE4A.
|
||||
* config/i386/i386.md: Add support for ABM instructions
|
||||
(popcnt and lzcnt).
|
||||
* config/i386/sse.md: Add support for SSE4A instructions
|
||||
(movntss, movntsd, extrq, insertq).
|
||||
* config/i386/i386.c: Add support for ABM and SSE4A builtins.
|
||||
Add -march=amdfam10 flag.
|
||||
* config/i386/ammintrin.h: Add support for SSE4A intrinsics.
|
||||
* doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt
|
||||
and amdfam10.
|
||||
* doc/extend.texi: Add documentation for SSE4A builtins.
|
||||
|
||||
2007-02-05 Bob Wilson <bob.wilson@acm.org>
|
||||
|
||||
* config/xtensa/xtensa.c (constantpool_mem_p): Skip over SUBREGs.
|
||||
|
@ -272,12 +272,12 @@ xscale-*-*)
|
||||
i[34567]86-*-*)
|
||||
cpu_type=i386
|
||||
extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
|
||||
pmmintrin.h tmmintrin.h"
|
||||
pmmintrin.h tmmintrin.h ammintrin.h"
|
||||
;;
|
||||
x86_64-*-*)
|
||||
cpu_type=i386
|
||||
extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
|
||||
pmmintrin.h tmmintrin.h"
|
||||
pmmintrin.h tmmintrin.h ammintrin.h"
|
||||
need_64bit_hwint=yes
|
||||
;;
|
||||
ia64-*-*)
|
||||
@ -1111,14 +1111,14 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu)
|
||||
# FIXME: -m64 for i[34567]86-*-* should be allowed just
|
||||
# like -m32 for x86_64-*-*.
|
||||
case X"${with_cpu}" in
|
||||
Xgeneric|Xcore2|Xnocona|Xx86-64|Xk8|Xopteron|Xathlon64|Xathlon-fx)
|
||||
Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xk8|Xopteron|Xathlon64|Xathlon-fx)
|
||||
;;
|
||||
X)
|
||||
with_cpu=generic
|
||||
;;
|
||||
*)
|
||||
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
|
||||
echo "generic core2 nocona x86-64 k8 opteron athlon64 athlon-fx" 1>&2
|
||||
echo "generic core2 nocona x86-64 amdfam10 k8 opteron athlon64 athlon-fx" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@ -1240,14 +1240,14 @@ i[34567]86-*-solaris2*)
|
||||
# FIXME: -m64 for i[34567]86-*-* should be allowed just
|
||||
# like -m32 for x86_64-*-*.
|
||||
case X"${with_cpu}" in
|
||||
Xgeneric|Xcore2|Xnocona|Xx86-64|Xk8|Xopteron|Xathlon64|Xathlon-fx)
|
||||
Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xk8|Xopteron|Xathlon64|Xathlon-fx)
|
||||
;;
|
||||
X)
|
||||
with_cpu=generic
|
||||
;;
|
||||
*)
|
||||
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
|
||||
echo "generic core2 nocona x86-64 k8 opteron athlon64 athlon-fx" 1>&2
|
||||
echo "generic core2 nocona x86-64 amdfam10 k8 opteron athlon64 athlon-fx" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@ -2568,6 +2568,9 @@ if test x$with_cpu = x ; then
|
||||
;;
|
||||
i686-*-* | i786-*-*)
|
||||
case ${target_noncanonical} in
|
||||
amdfam10-*)
|
||||
with_cpu=amdfam10
|
||||
;;
|
||||
k8-*|opteron-*|athlon_64-*)
|
||||
with_cpu=k8
|
||||
;;
|
||||
@ -2611,6 +2614,9 @@ if test x$with_cpu = x ; then
|
||||
;;
|
||||
x86_64-*-*)
|
||||
case ${target_noncanonical} in
|
||||
amdfam10-*)
|
||||
with_cpu=amdfam10
|
||||
;;
|
||||
k8-*|opteron-*|athlon_64-*)
|
||||
with_cpu=k8
|
||||
;;
|
||||
@ -2874,7 +2880,7 @@ case "${target}" in
|
||||
esac
|
||||
# OK
|
||||
;;
|
||||
"" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic)
|
||||
"" | amdfam10 | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic)
|
||||
# OK
|
||||
;;
|
||||
*)
|
||||
|
73
gcc/config/i386/ammintrin.h
Normal file
73
gcc/config/i386/ammintrin.h
Normal file
@ -0,0 +1,73 @@
|
||||
/* Copyright (C) 2007 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
GCC is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
GCC is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with GCC; see the file COPYING. If not, write to
|
||||
the Free Software Foundation, 51 Franklin Street, Fifth Floor,
|
||||
Boston, MA 02110-1301, USA. */
|
||||
|
||||
/* As a special exception, if you include this header file into source
|
||||
files compiled by GCC, this header file does not by itself cause
|
||||
the resulting executable to be covered by the GNU General Public
|
||||
License. This exception does not however invalidate any other
|
||||
reasons why the executable file might be covered by the GNU General
|
||||
Public License. */
|
||||
|
||||
/* Implemented from the specification included in the AMD Programmers
|
||||
Manual Update, version 2.x */
|
||||
|
||||
#ifndef _AMMINTRIN_H_INCLUDED
|
||||
#define _AMMINTRIN_H_INCLUDED
|
||||
|
||||
#ifndef __SSE4A__
|
||||
# error "SSE4A instruction set not enabled"
|
||||
#else
|
||||
|
||||
/* We need definitions from the SSE3, SSE2 and SSE header files.  */
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static __inline void __attribute__((__always_inline__))
|
||||
_mm_stream_sd (double * __P, __m128d __Y)
|
||||
{
|
||||
__builtin_ia32_movntsd (__P, (__v2df) __Y);
|
||||
}
|
||||
|
||||
static __inline void __attribute__((__always_inline__))
|
||||
_mm_stream_ss (float * __P, __m128 __Y)
|
||||
{
|
||||
__builtin_ia32_movntss (__P, (__v4sf) __Y);
|
||||
}
|
||||
|
||||
static __inline __m128i __attribute__((__always_inline__))
|
||||
_mm_extract_si64 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y);
|
||||
}
|
||||
|
||||
#define _mm_extracti_si64(X, I, L) \
|
||||
((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L))
|
||||
|
||||
static __inline __m128i __attribute__((__always_inline__))
|
||||
_mm_insert_si64 (__m128i __X,__m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y);
|
||||
}
|
||||
|
||||
#define _mm_inserti_si64(X, Y, I, L) \
|
||||
((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L))
|
||||
|
||||
|
||||
#endif /* __SSE4A__ */
|
||||
|
||||
#endif /* _AMMINTRIN_H_INCLUDED */
|
@ -29,6 +29,8 @@
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
|
||||
(define_attr "amdfam10_decode" "direct,vector,double"
|
||||
(const_string "direct"))
|
||||
;;
|
||||
;; decode0 decode1 decode2
|
||||
;; \ | /
|
||||
@ -131,18 +133,22 @@
|
||||
|
||||
;; Jump instructions are executed in the branch unit completely transparent to us
|
||||
(define_insn_reservation "athlon_branch" 0
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "ibr"))
|
||||
"athlon-direct,athlon-ieu")
|
||||
(define_insn_reservation "athlon_call" 0
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(eq_attr "type" "call,callv"))
|
||||
"athlon-vector,athlon-ieu")
|
||||
(define_insn_reservation "athlon_call_amdfam10" 0
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "call,callv"))
|
||||
"athlon-double,athlon-ieu")
|
||||
|
||||
;; Latency of push operation is 3 cycles, but ESP value is available
|
||||
;; earlier
|
||||
(define_insn_reservation "athlon_push" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "push"))
|
||||
"athlon-direct,athlon-agu,athlon-store")
|
||||
(define_insn_reservation "athlon_pop" 4
|
||||
@ -153,12 +159,16 @@
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(eq_attr "type" "pop"))
|
||||
"athlon-double,(athlon-ieu+athlon-load)")
|
||||
(define_insn_reservation "athlon_pop_amdfam10" 3
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "pop"))
|
||||
"athlon-direct,(athlon-ieu+athlon-load)")
|
||||
(define_insn_reservation "athlon_leave" 3
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(eq_attr "type" "leave"))
|
||||
"athlon-vector,(athlon-ieu+athlon-load)")
|
||||
(define_insn_reservation "athlon_leave_k8" 3
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(eq_attr "type" "leave"))
|
||||
"athlon-double,(athlon-ieu+athlon-load)")
|
||||
|
||||
@ -167,6 +177,11 @@
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(eq_attr "type" "lea"))
|
||||
"athlon-direct,athlon-agu,nothing")
|
||||
;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10
|
||||
(define_insn_reservation "athlon_lea_amdfam10" 1
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "lea"))
|
||||
"athlon-direct,athlon-agu,nothing")
|
||||
|
||||
;; Mul executes in special multiplier unit attached to IEU0
|
||||
(define_insn_reservation "athlon_imul" 5
|
||||
@ -176,29 +191,35 @@
|
||||
"athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0")
|
||||
;; ??? Widening multiply is vector or double.
|
||||
(define_insn_reservation "athlon_imul_k8_DI" 4
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "imul")
|
||||
(and (eq_attr "mode" "DI")
|
||||
(eq_attr "memory" "none,unknown"))))
|
||||
"athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
|
||||
(define_insn_reservation "athlon_imul_k8" 3
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "imul")
|
||||
(eq_attr "memory" "none,unknown")))
|
||||
"athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0")
|
||||
(define_insn_reservation "athlon_imul_amdfam10_HI" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "imul")
|
||||
(and (eq_attr "mode" "HI")
|
||||
(eq_attr "memory" "none,unknown"))))
|
||||
"athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
|
||||
(define_insn_reservation "athlon_imul_mem" 8
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(and (eq_attr "type" "imul")
|
||||
(eq_attr "memory" "load,both")))
|
||||
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu")
|
||||
(define_insn_reservation "athlon_imul_mem_k8_DI" 7
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "imul")
|
||||
(and (eq_attr "mode" "DI")
|
||||
(eq_attr "memory" "load,both"))))
|
||||
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu")
|
||||
(define_insn_reservation "athlon_imul_mem_k8" 6
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "imul")
|
||||
(eq_attr "memory" "load,both")))
|
||||
"athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu")
|
||||
@ -209,21 +230,23 @@
|
||||
;; other instructions.
|
||||
;; ??? Experiments show that the idiv can overlap with roughly 6 cycles
|
||||
;; of the other code
|
||||
;; Using the same heuristics for amdfam10 as K8 with idiv
|
||||
|
||||
(define_insn_reservation "athlon_idiv" 6
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "idiv")
|
||||
(eq_attr "memory" "none,unknown")))
|
||||
"athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))")
|
||||
(define_insn_reservation "athlon_idiv_mem" 9
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "idiv")
|
||||
(eq_attr "memory" "load,both")))
|
||||
"athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))")
|
||||
;; The parallelism of string instructions is not documented. Model it same way
|
||||
;; as idiv to create smaller automata. This probably does not matter much.
|
||||
;; Using the same heuristics for amdfam10 as K8 with idiv
|
||||
(define_insn_reservation "athlon_str" 6
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "str")
|
||||
(eq_attr "memory" "load,both,store")))
|
||||
"athlon-vector,athlon-load,athlon-ieu0*6")
|
||||
@ -234,34 +257,62 @@
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "none,unknown"))))
|
||||
"athlon-direct,athlon-ieu")
|
||||
(define_insn_reservation "athlon_idirect_amdfam10" 1
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "direct")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "none,unknown"))))
|
||||
"athlon-direct,athlon-ieu")
|
||||
(define_insn_reservation "athlon_ivector" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "vector")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "none,unknown"))))
|
||||
"athlon-vector,athlon-ieu,athlon-ieu")
|
||||
(define_insn_reservation "athlon_ivector_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "none,unknown"))))
|
||||
"athlon-vector,athlon-ieu,athlon-ieu")
|
||||
|
||||
(define_insn_reservation "athlon_idirect_loadmov" 3
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "imov")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-load")
|
||||
|
||||
(define_insn_reservation "athlon_idirect_load" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "direct")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-load,athlon-ieu")
|
||||
(define_insn_reservation "athlon_idirect_load_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "direct")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-load,athlon-ieu")
|
||||
(define_insn_reservation "athlon_ivector_load" 6
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "vector")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
|
||||
(define_insn_reservation "athlon_ivector_load_amdfam10" 6
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
|
||||
|
||||
(define_insn_reservation "athlon_idirect_movstore" 1
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "imov")
|
||||
(eq_attr "memory" "store")))
|
||||
"athlon-direct,athlon-agu,athlon-store")
|
||||
|
||||
(define_insn_reservation "athlon_idirect_both" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "direct")
|
||||
@ -270,6 +321,15 @@
|
||||
"athlon-direct,athlon-load,
|
||||
athlon-ieu,athlon-store,
|
||||
athlon-store")
|
||||
(define_insn_reservation "athlon_idirect_both_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "direct")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "both"))))
|
||||
"athlon-direct,athlon-load,
|
||||
athlon-ieu,athlon-store,
|
||||
athlon-store")
|
||||
|
||||
(define_insn_reservation "athlon_ivector_both" 6
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "vector")
|
||||
@ -279,6 +339,16 @@
|
||||
athlon-ieu,
|
||||
athlon-ieu,
|
||||
athlon-store")
|
||||
(define_insn_reservation "athlon_ivector_both_amdfam10" 6
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "both"))))
|
||||
"athlon-vector,athlon-load,
|
||||
athlon-ieu,
|
||||
athlon-ieu,
|
||||
athlon-store")
|
||||
|
||||
(define_insn_reservation "athlon_idirect_store" 1
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "direct")
|
||||
@ -286,6 +356,14 @@
|
||||
(eq_attr "memory" "store"))))
|
||||
"athlon-direct,(athlon-ieu+athlon-agu),
|
||||
athlon-store")
|
||||
(define_insn_reservation "athlon_idirect_store_amdfam10" 1
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "direct")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "store"))))
|
||||
"athlon-direct,(athlon-ieu+athlon-agu),
|
||||
athlon-store")
|
||||
|
||||
(define_insn_reservation "athlon_ivector_store" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "athlon_decode" "vector")
|
||||
@ -293,6 +371,13 @@
|
||||
(eq_attr "memory" "store"))))
|
||||
"athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu,
|
||||
athlon-store")
|
||||
(define_insn_reservation "athlon_ivector_store_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "unit" "integer,unknown")
|
||||
(eq_attr "memory" "store"))))
|
||||
"athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu,
|
||||
athlon-store")
|
||||
|
||||
;; Athlon floating point unit
|
||||
(define_insn_reservation "athlon_fldxf" 12
|
||||
@ -302,7 +387,7 @@
|
||||
(eq_attr "mode" "XF"))))
|
||||
"athlon-vector,athlon-fpload2,athlon-fvector*9")
|
||||
(define_insn_reservation "athlon_fldxf_k8" 13
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fmov")
|
||||
(and (eq_attr "memory" "load")
|
||||
(eq_attr "mode" "XF"))))
|
||||
@ -314,7 +399,7 @@
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fpload,athlon-fany")
|
||||
(define_insn_reservation "athlon_fld_k8" 2
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fmov")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||||
@ -326,7 +411,7 @@
|
||||
(eq_attr "mode" "XF"))))
|
||||
"athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))")
|
||||
(define_insn_reservation "athlon_fstxf_k8" 8
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fmov")
|
||||
(and (eq_attr "memory" "store,both")
|
||||
(eq_attr "mode" "XF"))))
|
||||
@ -337,16 +422,16 @@
|
||||
(eq_attr "memory" "store,both")))
|
||||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||||
(define_insn_reservation "athlon_fst_k8" 2
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fmov")
|
||||
(eq_attr "memory" "store,both")))
|
||||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||||
(define_insn_reservation "athlon_fist" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fistp,fisttp"))
|
||||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||||
(define_insn_reservation "athlon_fmov" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fmov"))
|
||||
"athlon-direct,athlon-fpsched,athlon-faddmul")
|
||||
(define_insn_reservation "athlon_fadd_load" 4
|
||||
@ -355,12 +440,12 @@
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fadd_load_k8" 6
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fop")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fadd" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fop"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fmul_load" 4
|
||||
@ -369,16 +454,16 @@
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fpload,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fmul_load_k8" 6
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fmul")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fmul" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fmul"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fsgn" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fsgn"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fdiv_load" 24
|
||||
@ -387,7 +472,7 @@
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fpload,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fdiv_load_k8" 13
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fdiv")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||||
@ -396,16 +481,16 @@
|
||||
(eq_attr "type" "fdiv"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fdiv_k8" 11
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fdiv"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||||
(define_insn_reservation "athlon_fpspc_load" 103
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fpspc")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-vector,athlon-fpload,athlon-fvector")
|
||||
(define_insn_reservation "athlon_fpspc" 100
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fpspc"))
|
||||
"athlon-vector,athlon-fpsched,athlon-fvector")
|
||||
(define_insn_reservation "athlon_fcmov_load" 7
|
||||
@ -418,12 +503,12 @@
|
||||
(eq_attr "type" "fcmov"))
|
||||
"athlon-vector,athlon-fpsched,athlon-fvector")
|
||||
(define_insn_reservation "athlon_fcmov_load_k8" 17
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fcmov")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-vector,athlon-fploadk8,athlon-fvector")
|
||||
(define_insn_reservation "athlon_fcmov_k8" 15
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fcmov"))
|
||||
"athlon-vector,athlon-fpsched,athlon-fvector")
|
||||
;; fcomi is vector decoded but uses only one pipe.
|
||||
@ -434,13 +519,13 @@
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-vector,athlon-fpload,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fcomi_load_k8" 5
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fcmp")
|
||||
(and (eq_attr "athlon_decode" "vector")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-vector,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fcomi" 3
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "athlon_decode" "vector")
|
||||
(eq_attr "type" "fcmp")))
|
||||
"athlon-vector,athlon-fpsched,athlon-fadd")
|
||||
@ -450,18 +535,18 @@
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fcom_load_k8" 4
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "fcmp")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_fcom" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(eq_attr "type" "fcmp"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
;; Never seen by the scheduler because we still don't do post reg-stack
|
||||
;; scheduling.
|
||||
;(define_insn_reservation "athlon_fxch" 2
|
||||
; (and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
; (eq_attr "type" "fxch"))
|
||||
; "athlon-direct,athlon-fpsched,athlon-fany")
|
||||
|
||||
@ -516,6 +601,23 @@
|
||||
(and (eq_attr "type" "mmxmov,ssemov")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||||
;; On AMDFAM10 all double, single and integer packed and scalar SSEx data
|
||||
;; loads generated are direct path, latency of 2 and do not use any FP
|
||||
;; execution units.  No separate entries for movlpx/movhpx loads, which
|
||||
;; are direct path, latency of 4 and use the FADD/FMUL FP execution units,
|
||||
;; as they will not be generated.
|
||||
(define_insn_reservation "athlon_sseld_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssemov")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8")
|
||||
;; On AMDFAM10 MMX data loads generated are direct path, latency of 4
|
||||
;; and can use any FP execution unit.
|
||||
(define_insn_reservation "athlon_mmxld_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "mmxmov")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8, athlon-fany")
|
||||
(define_insn_reservation "athlon_mmxssest" 3
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "type" "mmxmov,ssemov")
|
||||
@ -533,6 +635,25 @@
|
||||
(and (eq_attr "type" "mmxmov,ssemov")
|
||||
(eq_attr "memory" "store,both")))
|
||||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||||
;; On AMDFAM10 all double, single and integer packed SSEx data stores
|
||||
;; generated are all double path, latency of 2 and use the FSTORE FP
|
||||
;; execution unit.  No separate entries for movupx/movdqu, which are
|
||||
;; vector path, latency of 3 and use the FSTORE*2 FP execution unit,
|
||||
;; as they will not be generated.
|
||||
(define_insn_reservation "athlon_ssest_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssemov")
|
||||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||||
(eq_attr "memory" "store,both"))))
|
||||
"athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)")
|
||||
;; On AMDFAM10 all double, single and integer scalar SSEx and MMX
|
||||
;; data stores generated are all direct path, latency of 2 and use
|
||||
;; the FSTORE FP execution unit
|
||||
(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "mmxmov,ssemov")
|
||||
(eq_attr "memory" "store,both")))
|
||||
"athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
|
||||
(define_insn_reservation "athlon_movaps_k8" 2
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "type" "ssemov")
|
||||
@ -578,6 +699,11 @@
|
||||
(and (eq_attr "type" "sselog,sselog1")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
|
||||
(define_insn_reservation "athlon_sselog_load_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sselog,sselog1")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)")
|
||||
(define_insn_reservation "athlon_sselog" 3
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(eq_attr "type" "sselog,sselog1"))
|
||||
@ -586,6 +712,11 @@
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(eq_attr "type" "sselog,sselog1"))
|
||||
"athlon-double,athlon-fpsched,athlon-fmul")
|
||||
(define_insn_reservation "athlon_sselog_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "sselog,sselog1"))
|
||||
"athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)")
|
||||
|
||||
;; ??? pcmp executes in addmul, probably not worthwhile to bother about that.
|
||||
(define_insn_reservation "athlon_ssecmp_load" 2
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
@ -594,13 +725,13 @@
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecmp_load_k8" 4
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "ssecmp")
|
||||
(and (eq_attr "mode" "SF,DF,DI,TI")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecmp" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "ssecmp")
|
||||
(eq_attr "mode" "SF,DF,DI,TI")))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
@ -614,6 +745,11 @@
|
||||
(and (eq_attr "type" "ssecmp")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
|
||||
(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecmp")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecmpvector" 3
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(eq_attr "type" "ssecmp"))
|
||||
@ -622,6 +758,10 @@
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(eq_attr "type" "ssecmp"))
|
||||
"athlon-double,athlon-fpsched,(athlon-fadd*2)")
|
||||
(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "ssecmp"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecomi_load" 4
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(and (eq_attr "type" "ssecomi")
|
||||
@ -632,10 +772,20 @@
|
||||
(and (eq_attr "type" "ssecomi")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-vector,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecomi")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecomi" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(eq_attr "type" "ssecmp"))
|
||||
"athlon-vector,athlon-fpsched,athlon-fadd")
|
||||
(define_insn_reservation "athlon_ssecomi_amdfam10" 3
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10
|
||||
(eq_attr "type" "ssecomi"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
(define_insn_reservation "athlon_sseadd_load" 4
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(and (eq_attr "type" "sseadd")
|
||||
@ -643,13 +793,13 @@
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fpload,athlon-fadd")
|
||||
(define_insn_reservation "athlon_sseadd_load_k8" 6
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "sseadd")
|
||||
(and (eq_attr "mode" "SF,DF,DI")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_sseadd" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "sseadd")
|
||||
(eq_attr "mode" "SF,DF,DI")))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
@ -663,6 +813,11 @@
|
||||
(and (eq_attr "type" "sseadd")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
|
||||
(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseadd")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fadd")
|
||||
(define_insn_reservation "athlon_sseaddvector" 5
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(eq_attr "type" "sseadd"))
|
||||
@ -671,6 +826,10 @@
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(eq_attr "type" "sseadd"))
|
||||
"athlon-double,athlon-fpsched,(athlon-fadd*2)")
|
||||
(define_insn_reservation "athlon_sseaddvector_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "sseadd"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fadd")
|
||||
|
||||
;; Conversions behave very irregularly and the scheduling is critical here.
|
||||
;; Take each instruction separately. Assume that the mode is always set to the
|
||||
@ -684,12 +843,25 @@
|
||||
(and (eq_attr "mode" "DF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "DF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtss2sd" 2
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "athlon_decode" "direct")
|
||||
(eq_attr "mode" "DF"))))
|
||||
"athlon-direct,athlon-fpsched,athlon-fstore")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(eq_attr "mode" "DF"))))
|
||||
"athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)")
|
||||
;; cvtps2pd. Model same way the other double decoded FP conversions.
|
||||
(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5
|
||||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||||
@ -698,12 +870,25 @@
|
||||
(and (eq_attr "mode" "V2DF,V4SF,TI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fpload2k8,(athlon-fstore*2)")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "direct")
|
||||
(and (eq_attr "mode" "V2DF,V4SF,TI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3
|
||||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "athlon_decode" "double")
|
||||
(eq_attr "mode" "V2DF,V4SF,TI"))))
|
||||
"athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "direct")
|
||||
(eq_attr "mode" "V2DF,V4SF,TI"))))
|
||||
"athlon-direct,athlon-fpsched,athlon-fstore")
|
||||
;; cvtsi2sd mem,reg is directpath (cvtsi2sd reg,reg is doublepath)
|
||||
;; cvtsi2sd has throughput 1 and is executed in store unit with latency of 6
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6
|
||||
@ -713,6 +898,13 @@
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fstore")
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtsi2ss mem, reg is doublepath
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
@ -728,6 +920,13 @@
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-fstore*2)")
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtsi2sd reg,reg is double decoded (vector on Athlon)
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11
|
||||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||||
@ -736,6 +935,13 @@
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-double,athlon-fploadk8,athlon-fstore")
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtsi2ss reg, reg is doublepath
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
@ -744,6 +950,13 @@
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-vector,athlon-fploadk8,(athlon-fvector*2)")
|
||||
(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtsd2ss mem,reg is doublepath, throughput unknown, latency 9
|
||||
(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9
|
||||
(and (eq_attr "cpu" "k8,athlon,generic64")
|
||||
@ -752,6 +965,13 @@
|
||||
(and (eq_attr "mode" "SF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-fstore*3)")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "SF")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtsd2ss reg,reg is vectorpath, throughput unknown, latency 12
|
||||
(define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
@ -760,6 +980,13 @@
|
||||
(and (eq_attr "mode" "SF")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-vector,athlon-fpsched,(athlon-fvector*3)")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "vector")
|
||||
(and (eq_attr "mode" "SF")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
@ -767,6 +994,13 @@
|
||||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fpload2k8,(athlon-fstore*3)")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtpd2ps mem,reg is vectorpath, throughput unknown, latency 10
|
||||
;; ??? Why is it faster than cvtsd2ss?
|
||||
(define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8
|
||||
@ -776,6 +1010,13 @@
|
||||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-vector,athlon-fpsched,athlon-fvector*2")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssecvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "V4SF,V2DF,TI")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtsd2si mem,reg is doublepath, throughput 1, latency 9
|
||||
(define_insn_reservation "athlon_secvt_cvtsX2si_load" 9
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
@ -784,6 +1025,13 @@
|
||||
(and (eq_attr "mode" "SI,DI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-vector,athlon-fploadk8,athlon-fvector")
|
||||
(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "SI,DI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)")
|
||||
;; cvtsd2si reg,reg is doublepath, throughput 1, latency 9
|
||||
(define_insn_reservation "athlon_ssecvt_cvtsX2si" 9
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
@ -799,6 +1047,29 @@
|
||||
(and (eq_attr "mode" "SI,DI")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-double,athlon-fpsched,athlon-fstore")
|
||||
(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "SI,DI")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)")
|
||||
;; cvtpd2dq reg,mem is doublepath, throughput 1, latency 9 on amdfam10
|
||||
(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "TI")
|
||||
(eq_attr "memory" "load")))))
|
||||
"athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
|
||||
;; cvtpd2dq reg,reg is doublepath, throughput 1, latency 7 on amdfam10
|
||||
(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseicvt")
|
||||
(and (eq_attr "amdfam10_decode" "double")
|
||||
(and (eq_attr "mode" "TI")
|
||||
(eq_attr "memory" "none")))))
|
||||
"athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)")
|
||||
|
||||
|
||||
(define_insn_reservation "athlon_ssemul_load" 4
|
||||
@ -808,13 +1079,13 @@
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fpload,athlon-fmul")
|
||||
(define_insn_reservation "athlon_ssemul_load_k8" 6
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "ssemul")
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||||
(define_insn_reservation "athlon_ssemul" 4
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "ssemul")
|
||||
(eq_attr "mode" "SF,DF")))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||||
@ -828,6 +1099,11 @@
|
||||
(and (eq_attr "type" "ssemul")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
|
||||
(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssemul")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fmul")
|
||||
(define_insn_reservation "athlon_ssemulvector" 5
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(eq_attr "type" "ssemul"))
|
||||
@ -836,6 +1112,10 @@
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(eq_attr "type" "ssemul"))
|
||||
"athlon-double,athlon-fpsched,(athlon-fmul*2)")
|
||||
(define_insn_reservation "athlon_ssemulvector_amdfam10" 4
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "ssemul"))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul")
|
||||
;; divsd timings. divss is faster
|
||||
(define_insn_reservation "athlon_ssediv_load" 20
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
@ -844,13 +1124,13 @@
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fpload,athlon-fmul*17")
|
||||
(define_insn_reservation "athlon_ssediv_load_k8" 22
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(and (eq_attr "cpu" "k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "ssediv")
|
||||
(and (eq_attr "mode" "SF,DF")
|
||||
(eq_attr "memory" "load"))))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fmul*17")
|
||||
(define_insn_reservation "athlon_ssediv" 20
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64")
|
||||
(and (eq_attr "cpu" "athlon,k8,generic64,amdfam10")
|
||||
(and (eq_attr "type" "ssediv")
|
||||
(eq_attr "mode" "SF,DF")))
|
||||
"athlon-direct,athlon-fpsched,athlon-fmul*17")
|
||||
@ -864,6 +1144,11 @@
|
||||
(and (eq_attr "type" "ssediv")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-double,athlon-fpload2k8,athlon-fmul*34")
|
||||
(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "ssediv")
|
||||
(eq_attr "memory" "load")))
|
||||
"athlon-direct,athlon-fploadk8,athlon-fmul*17")
|
||||
(define_insn_reservation "athlon_ssedivvector" 39
|
||||
(and (eq_attr "cpu" "athlon")
|
||||
(eq_attr "type" "ssediv"))
|
||||
@ -872,3 +1157,12 @@
|
||||
(and (eq_attr "cpu" "k8,generic64")
|
||||
(eq_attr "type" "ssediv"))
|
||||
"athlon-double,athlon-fmul*34")
|
||||
(define_insn_reservation "athlon_ssedivvector_amdfam10" 20
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(eq_attr "type" "ssediv"))
|
||||
"athlon-direct,athlon-fmul*17")
|
||||
(define_insn_reservation "athlon_sseins_amdfam10" 5
|
||||
(and (eq_attr "cpu" "amdfam10")
|
||||
(and (eq_attr "type" "sseins")
|
||||
(eq_attr "mode" "TI")))
|
||||
"athlon-vector,athlon-fpsched,athlon-faddmul")
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
@ -30,7 +30,11 @@
|
||||
#ifndef _EMMINTRIN_H_INCLUDED
|
||||
#define _EMMINTRIN_H_INCLUDED
|
||||
|
||||
#ifdef __SSE2__
|
||||
#ifndef __SSE2__
|
||||
# error "SSE2 instruction set not enabled"
|
||||
#else
|
||||
|
||||
/* We need definitions from the SSE header files*/
|
||||
#include <xmmintrin.h>
|
||||
|
||||
/* SSE2 */
|
||||
|
@ -604,6 +604,80 @@ struct processor_costs k8_cost = {
|
||||
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
|
||||
};
|
||||
|
||||
struct processor_costs amdfam10_cost = {
|
||||
COSTS_N_INSNS (1), /* cost of an add instruction */
|
||||
COSTS_N_INSNS (2), /* cost of a lea instruction */
|
||||
COSTS_N_INSNS (1), /* variable shift costs */
|
||||
COSTS_N_INSNS (1), /* constant shift costs */
|
||||
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
|
||||
COSTS_N_INSNS (4), /* HI */
|
||||
COSTS_N_INSNS (3), /* SI */
|
||||
COSTS_N_INSNS (4), /* DI */
|
||||
COSTS_N_INSNS (5)}, /* other */
|
||||
0, /* cost of multiply per each bit set */
|
||||
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
|
||||
COSTS_N_INSNS (35), /* HI */
|
||||
COSTS_N_INSNS (51), /* SI */
|
||||
COSTS_N_INSNS (83), /* DI */
|
||||
COSTS_N_INSNS (83)}, /* other */
|
||||
COSTS_N_INSNS (1), /* cost of movsx */
|
||||
COSTS_N_INSNS (1), /* cost of movzx */
|
||||
8, /* "large" insn */
|
||||
9, /* MOVE_RATIO */
|
||||
4, /* cost for loading QImode using movzbl */
|
||||
{3, 4, 3}, /* cost of loading integer registers
|
||||
in QImode, HImode and SImode.
|
||||
Relative to reg-reg move (2). */
|
||||
{3, 4, 3}, /* cost of storing integer registers */
|
||||
4, /* cost of reg,reg fld/fst */
|
||||
{4, 4, 12}, /* cost of loading fp registers
|
||||
in SFmode, DFmode and XFmode */
|
||||
{6, 6, 8}, /* cost of storing fp registers
|
||||
in SFmode, DFmode and XFmode */
|
||||
2, /* cost of moving MMX register */
|
||||
{3, 3}, /* cost of loading MMX registers
|
||||
in SImode and DImode */
|
||||
{4, 4}, /* cost of storing MMX registers
|
||||
in SImode and DImode */
|
||||
2, /* cost of moving SSE register */
|
||||
{4, 4, 3}, /* cost of loading SSE registers
|
||||
in SImode, DImode and TImode */
|
||||
{4, 4, 5}, /* cost of storing SSE registers
|
||||
in SImode, DImode and TImode */
|
||||
3, /* MMX or SSE register to integer */
|
||||
/* On K8
|
||||
MOVD reg64, xmmreg Double FSTORE 4
|
||||
MOVD reg32, xmmreg Double FSTORE 4
|
||||
On AMDFAM10
|
||||
MOVD reg64, xmmreg Double FADD 3
|
||||
1/1 1/1
|
||||
MOVD reg32, xmmreg Double FADD 3
|
||||
1/1 1/1 */
|
||||
64, /* size of prefetch block */
|
||||
/* New AMD processors never drop prefetches; if they cannot be performed
|
||||
immediately, they are queued. We set number of simultaneous prefetches
|
||||
to a large constant to reflect this (it probably is not a good idea not
|
||||
to limit number of prefetches at all, as their execution also takes some
|
||||
time). */
|
||||
100, /* number of parallel prefetches */
|
||||
5, /* Branch cost */
|
||||
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
|
||||
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
|
||||
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
|
||||
COSTS_N_INSNS (2), /* cost of FABS instruction. */
|
||||
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
|
||||
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
|
||||
|
||||
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
|
||||
very small blocks it is better to use loop. For large blocks, libcall can
|
||||
do non-temporal accesses and beat inline considerably. */
|
||||
{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
|
||||
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
|
||||
{{libcall, {{8, loop}, {24, unrolled_loop},
|
||||
{2048, rep_prefix_4_byte}, {-1, libcall}}},
|
||||
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
|
||||
};
|
||||
|
||||
static const
|
||||
struct processor_costs pentium4_cost = {
|
||||
COSTS_N_INSNS (1), /* cost of an add instruction */
|
||||
@ -917,11 +991,13 @@ const struct processor_costs *ix86_cost = &pentium_cost;
|
||||
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
|
||||
#define m_K8 (1<<PROCESSOR_K8)
|
||||
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
|
||||
#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
|
||||
#define m_NOCONA (1<<PROCESSOR_NOCONA)
|
||||
#define m_CORE2 (1<<PROCESSOR_CORE2)
|
||||
#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
|
||||
#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
|
||||
#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
|
||||
#define m_ATHLON_K8_AMDFAM10 (m_K8 | m_ATHLON | m_AMDFAM10)
|
||||
|
||||
/* Generic instruction choice should be common subset of supported CPUs
|
||||
(PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
|
||||
@ -929,22 +1005,30 @@ const struct processor_costs *ix86_cost = &pentium_cost;
|
||||
/* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
|
||||
Generic64 seems like good code size tradeoff. We can't enable it for 32bit
|
||||
generic because it is not working well with PPro base chips. */
|
||||
const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8 | m_CORE2 | m_GENERIC64;
|
||||
const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
|
||||
| m_GENERIC64;
|
||||
const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||||
| m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_zero_extend_with_and = m_486 | m_PENT;
|
||||
const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
|
||||
/* Enable to zero extend integer registers to avoid partial dependencies */
|
||||
const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
|
||||
| m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
|
||||
const int x86_double_with_add = ~m_386;
|
||||
const int x86_use_bit_test = m_386;
|
||||
const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6 | m_CORE2 | m_GENERIC;
|
||||
const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA;
|
||||
const int x86_3dnow_a = m_ATHLON_K8;
|
||||
const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
|
||||
| m_K6 | m_CORE2 | m_GENERIC;
|
||||
const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||||
| m_NOCONA;
|
||||
const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
|
||||
const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
|
||||
| m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
/* Branch hints were put in P4 based on simulation result. But
|
||||
after P4 was made, no performance benefit was observed with
|
||||
branch hints. It also increases the code size. As the result,
|
||||
icc never generates branch hints. */
|
||||
const int x86_branch_hints = 0;
|
||||
const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32; /*m_GENERIC | m_ATHLON_K8 ? */
|
||||
const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
|
||||
/*m_GENERIC | m_ATHLON_K8 ? */
|
||||
/* We probably ought to watch for partial register stalls on Generic32
|
||||
compilation setting as well. However in current implementation the
|
||||
partial register stalls are not eliminated very well - they can
|
||||
@ -956,13 +1040,16 @@ const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
|
||||
const int x86_partial_reg_stall = m_PPRO;
|
||||
const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
|
||||
const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
|
||||
const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT | m_CORE2 | m_GENERIC);
|
||||
const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
|
||||
| m_CORE2 | m_GENERIC);
|
||||
const int x86_use_mov0 = m_K6;
|
||||
const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
|
||||
const int x86_read_modify_write = ~m_PENT;
|
||||
const int x86_read_modify = ~(m_PENT | m_PPRO);
|
||||
const int x86_split_long_moves = m_PPRO;
|
||||
const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8 | m_CORE2 | m_GENERIC; /* m_PENT4 ? */
|
||||
const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
|
||||
| m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
|
||||
/* m_PENT4 ? */
|
||||
const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
|
||||
const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
|
||||
const int x86_qimode_math = ~(0);
|
||||
@ -972,18 +1059,37 @@ const int x86_promote_qi_regs = 0;
|
||||
if our scheme for avoiding partial stalls was more effective. */
|
||||
const int x86_himode_math = ~(m_PPRO);
|
||||
const int x86_promote_hi_regs = m_PPRO;
|
||||
const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_add_esp_4 = m_ATHLON_K8 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6_GEODE | m_386 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
|
||||
const int x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
|
||||
/* Enable if add/sub rsp is preferred over 1 or 2 push/pop */
|
||||
const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
|
||||
| m_CORE2 | m_GENERIC;
|
||||
const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
|
||||
| m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
|
||||
| m_CORE2 | m_GENERIC;
|
||||
const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
|
||||
| m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
/* Enable if integer moves are preferred for DFmode copies */
|
||||
const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
|
||||
| m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
|
||||
const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
|
||||
| m_CORE2 | m_GENERIC;
|
||||
const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
|
||||
| m_CORE2 | m_GENERIC;
|
||||
/* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
|
||||
for outgoing arguments will be computed and placed into the variable
|
||||
`current_function_outgoing_args_size'. No space will be pushed onto the stack
|
||||
for each call; instead, the function prologue should increase the stack frame
|
||||
size by this amount. Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
|
||||
not proper. */
|
||||
const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||||
| m_NOCONA | m_PPRO | m_CORE2
|
||||
| m_GENERIC;
|
||||
const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
|
||||
const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
|
||||
const int x86_shift1 = ~m_486;
|
||||
const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
|
||||
| m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||||
| m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
/* In Generic model we have a conflict here between PPro/Pentium4 based chips
|
||||
that treat 128bit SSE registers as single units versus K8 based chips that
|
||||
divide SSE registers into two 64bit halves.
|
||||
@ -993,27 +1099,81 @@ const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PEN
|
||||
this option on P4 brings over 20% SPECfp regression, while enabling it on
|
||||
K8 brings roughly 2.4% regression that can be partly masked by careful scheduling
|
||||
of moves. */
|
||||
const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
|
||||
const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
|
||||
| m_GENERIC | m_AMDFAM10;
|
||||
/* Set for machines where the type and dependencies are resolved on SSE
|
||||
register parts instead of whole registers, so we may maintain just
|
||||
lower part of scalar values in proper format leaving the upper part
|
||||
undefined. */
|
||||
const int x86_sse_split_regs = m_ATHLON_K8;
|
||||
const int x86_sse_typeless_stores = m_ATHLON_K8;
|
||||
/* Code generation for scalar reg-reg moves of single and double precision data:
|
||||
if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
|
||||
movaps reg, reg
|
||||
else
|
||||
movss reg, reg
|
||||
if (x86_sse_partial_reg_dependency == true)
|
||||
movapd reg, reg
|
||||
else
|
||||
movsd reg, reg
|
||||
|
||||
Code generation for scalar loads of double precision data:
|
||||
if (x86_sse_split_regs == true)
|
||||
movlpd mem, reg (gas syntax)
|
||||
else
|
||||
movsd mem, reg
|
||||
|
||||
Code generation for unaligned packed loads of single precision data
|
||||
(x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
|
||||
if (x86_sse_unaligned_move_optimal)
|
||||
movups mem, reg
|
||||
|
||||
if (x86_sse_partial_reg_dependency == true)
|
||||
{
|
||||
xorps reg, reg
|
||||
movlps mem, reg
|
||||
movhps mem+8, reg
|
||||
}
|
||||
else
|
||||
{
|
||||
movlps mem, reg
|
||||
movhps mem+8, reg
|
||||
}
|
||||
|
||||
Code generation for unaligned packed loads of double precision data
|
||||
(x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
|
||||
if (x86_sse_unaligned_move_optimal)
|
||||
movupd mem, reg
|
||||
|
||||
if (x86_sse_split_regs == true)
|
||||
{
|
||||
movlpd mem, reg
|
||||
movhpd mem+8, reg
|
||||
}
|
||||
else
|
||||
{
|
||||
movsd mem, reg
|
||||
movhpd mem+8, reg
|
||||
}
|
||||
*/
|
||||
const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
|
||||
const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
|
||||
const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
|
||||
const int x86_use_ffreep = m_ATHLON_K8;
|
||||
const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
|
||||
const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
|
||||
|
||||
/* ??? Allowing interunit moves makes it all too easy for the compiler to put
|
||||
integer data in xmm registers. Which results in pretty abysmal code. */
|
||||
const int x86_inter_unit_moves = 0 /* ~(m_ATHLON_K8) */;
|
||||
|
||||
const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
|
||||
const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4
|
||||
| m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
|
||||
/* Some CPU cores are not able to predict more than 4 branch instructions in
|
||||
the 16 byte window. */
|
||||
const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_schedule = m_PPRO | m_ATHLON_K8 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC;
|
||||
const int x86_use_bt = m_ATHLON_K8;
|
||||
const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
|
||||
| m_NOCONA | m_CORE2 | m_GENERIC;
|
||||
const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
|
||||
| m_CORE2 | m_GENERIC;
|
||||
const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
|
||||
/* Compare and exchange was added for 80486. */
|
||||
const int x86_cmpxchg = ~m_386;
|
||||
/* Compare and exchange 8 bytes was added for pentium. */
|
||||
@ -1022,7 +1182,7 @@ const int x86_cmpxchg8b = ~(m_386 | m_486);
|
||||
const int x86_xadd = ~m_386;
|
||||
/* Byteswap was added for 80486. */
|
||||
const int x86_bswap = ~m_386;
|
||||
const int x86_pad_returns = m_ATHLON_K8 | m_CORE2 | m_GENERIC;
|
||||
const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
|
||||
|
||||
static enum stringop_alg stringop_alg = no_stringop;
|
||||
|
||||
@ -1600,16 +1760,24 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
|
||||
case OPT_msse:
|
||||
if (!value)
|
||||
{
|
||||
target_flags &= ~(MASK_SSE2 | MASK_SSE3);
|
||||
target_flags_explicit |= MASK_SSE2 | MASK_SSE3;
|
||||
target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSE4A);
|
||||
target_flags_explicit |= MASK_SSE2 | MASK_SSE3 | MASK_SSE4A;
|
||||
}
|
||||
return true;
|
||||
|
||||
case OPT_msse2:
|
||||
if (!value)
|
||||
{
|
||||
target_flags &= ~MASK_SSE3;
|
||||
target_flags_explicit |= MASK_SSE3;
|
||||
target_flags &= ~(MASK_SSE3 | MASK_SSE4A);
|
||||
target_flags_explicit |= MASK_SSE3 | MASK_SSE4A;
|
||||
}
|
||||
return true;
|
||||
|
||||
case OPT_msse3:
|
||||
if (!value)
|
||||
{
|
||||
target_flags &= ~MASK_SSE4A;
|
||||
target_flags_explicit |= MASK_SSE4A;
|
||||
}
|
||||
return true;
|
||||
|
||||
@ -1661,7 +1829,8 @@ override_options (void)
|
||||
{&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
|
||||
{&core2_cost, 0, 0, 16, 7, 16, 7, 16},
|
||||
{&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
|
||||
{&generic64_cost, 0, 0, 16, 7, 16, 7, 16}
|
||||
{&generic64_cost, 0, 0, 16, 7, 16, 7, 16},
|
||||
{&amdfam10_cost, 0, 0, 32, 7, 32, 7, 32}
|
||||
};
|
||||
|
||||
static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
|
||||
@ -1680,7 +1849,10 @@ override_options (void)
|
||||
PTA_3DNOW_A = 64,
|
||||
PTA_64BIT = 128,
|
||||
PTA_SSSE3 = 256,
|
||||
PTA_CX16 = 512
|
||||
PTA_CX16 = 512,
|
||||
PTA_POPCNT = 1024,
|
||||
PTA_ABM = 2048,
|
||||
PTA_SSE4A = 4096
|
||||
} flags;
|
||||
}
|
||||
const processor_alias_table[] =
|
||||
@ -1736,6 +1908,10 @@ override_options (void)
|
||||
| PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
|
||||
{"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
|
||||
| PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
|
||||
{"amdfam10", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
|
||||
| PTA_64BIT | PTA_3DNOW_A | PTA_SSE
|
||||
| PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
|
||||
| PTA_ABM | PTA_SSE4A | PTA_CX16},
|
||||
{"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
|
||||
{"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
|
||||
};
|
||||
@ -1919,6 +2095,15 @@ override_options (void)
|
||||
x86_prefetch_sse = true;
|
||||
if (processor_alias_table[i].flags & PTA_CX16)
|
||||
x86_cmpxchg16b = true;
|
||||
if (processor_alias_table[i].flags & PTA_POPCNT
|
||||
&& !(target_flags_explicit & MASK_POPCNT))
|
||||
target_flags |= MASK_POPCNT;
|
||||
if (processor_alias_table[i].flags & PTA_ABM
|
||||
&& !(target_flags_explicit & MASK_ABM))
|
||||
target_flags |= MASK_ABM;
|
||||
if (processor_alias_table[i].flags & PTA_SSE4A
|
||||
&& !(target_flags_explicit & MASK_SSE4A))
|
||||
target_flags |= MASK_SSE4A;
|
||||
if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
|
||||
error ("CPU you selected does not support x86-64 "
|
||||
"instruction set");
|
||||
@ -2097,6 +2282,10 @@ override_options (void)
|
||||
if (TARGET_SSSE3)
|
||||
target_flags |= MASK_SSE3;
|
||||
|
||||
/* Turn on SSE3 builtins for -msse4a. */
|
||||
if (TARGET_SSE4A)
|
||||
target_flags |= MASK_SSE3;
|
||||
|
||||
/* Turn on SSE2 builtins for -msse3. */
|
||||
if (TARGET_SSE3)
|
||||
target_flags |= MASK_SSE2;
|
||||
@ -2116,6 +2305,10 @@ override_options (void)
|
||||
if (TARGET_3DNOW)
|
||||
target_flags |= MASK_MMX;
|
||||
|
||||
/* Turn on POPCNT builtins for -mabm. */
|
||||
if (TARGET_ABM)
|
||||
target_flags |= MASK_POPCNT;
|
||||
|
||||
if (TARGET_64BIT)
|
||||
{
|
||||
if (TARGET_ALIGN_DOUBLE)
|
||||
@ -9308,8 +9501,16 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
|
||||
}
|
||||
|
||||
if (TARGET_SSE2 && mode == V2DFmode)
|
||||
{
|
||||
rtx zero;
|
||||
{
|
||||
rtx zero;
|
||||
|
||||
if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
|
||||
{
|
||||
op0 = gen_lowpart (V2DFmode, op0);
|
||||
op1 = gen_lowpart (V2DFmode, op1);
|
||||
emit_insn (gen_sse2_movupd (op0, op1));
|
||||
return;
|
||||
}
|
||||
|
||||
/* When SSE registers are split into halves, we can avoid
|
||||
writing to the top half twice. */
|
||||
@ -9337,7 +9538,15 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
|
||||
emit_insn (gen_sse2_loadhpd (op0, op0, m));
|
||||
}
|
||||
else
|
||||
{
|
||||
{
|
||||
if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
|
||||
{
|
||||
op0 = gen_lowpart (V4SFmode, op0);
|
||||
op1 = gen_lowpart (V4SFmode, op1);
|
||||
emit_insn (gen_sse_movups (op0, op1));
|
||||
return;
|
||||
}
|
||||
|
||||
if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
|
||||
emit_move_insn (op0, CONST0_RTX (mode));
|
||||
else
|
||||
@ -14699,6 +14908,7 @@ ix86_issue_rate (void)
|
||||
case PROCESSOR_PENTIUM4:
|
||||
case PROCESSOR_ATHLON:
|
||||
case PROCESSOR_K8:
|
||||
case PROCESSOR_AMDFAM10:
|
||||
case PROCESSOR_NOCONA:
|
||||
case PROCESSOR_GENERIC32:
|
||||
case PROCESSOR_GENERIC64:
|
||||
@ -14897,6 +15107,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
|
||||
|
||||
case PROCESSOR_ATHLON:
|
||||
case PROCESSOR_K8:
|
||||
case PROCESSOR_AMDFAM10:
|
||||
case PROCESSOR_GENERIC32:
|
||||
case PROCESSOR_GENERIC64:
|
||||
memory = get_attr_memory (insn);
|
||||
@ -15609,6 +15820,14 @@ enum ix86_builtins
|
||||
IX86_BUILTIN_PABSW128,
|
||||
IX86_BUILTIN_PABSD128,
|
||||
|
||||
/* AMDFAM10 - SSE4A New Instructions. */
|
||||
IX86_BUILTIN_MOVNTSD,
|
||||
IX86_BUILTIN_MOVNTSS,
|
||||
IX86_BUILTIN_EXTRQI,
|
||||
IX86_BUILTIN_EXTRQ,
|
||||
IX86_BUILTIN_INSERTQI,
|
||||
IX86_BUILTIN_INSERTQ,
|
||||
|
||||
IX86_BUILTIN_VEC_INIT_V2SI,
|
||||
IX86_BUILTIN_VEC_INIT_V4HI,
|
||||
IX86_BUILTIN_VEC_INIT_V8QI,
|
||||
@ -16366,6 +16585,18 @@ ix86_init_mmx_sse_builtins (void)
|
||||
= build_function_type_list (void_type_node,
|
||||
pchar_type_node, V16QI_type_node, NULL_TREE);
|
||||
|
||||
tree v2di_ftype_v2di_unsigned_unsigned
|
||||
= build_function_type_list (V2DI_type_node, V2DI_type_node,
|
||||
unsigned_type_node, unsigned_type_node,
|
||||
NULL_TREE);
|
||||
tree v2di_ftype_v2di_v2di_unsigned_unsigned
|
||||
= build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node,
|
||||
unsigned_type_node, unsigned_type_node,
|
||||
NULL_TREE);
|
||||
tree v2di_ftype_v2di_v16qi
|
||||
= build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
|
||||
NULL_TREE);
|
||||
|
||||
tree float80_type;
|
||||
tree float128_type;
|
||||
tree ftype;
|
||||
@ -16702,6 +16933,20 @@ ix86_init_mmx_sse_builtins (void)
|
||||
def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int,
|
||||
IX86_BUILTIN_PALIGNR);
|
||||
|
||||
/* AMDFAM10 SSE4A New built-ins */
|
||||
def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd",
|
||||
void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD);
|
||||
def_builtin (MASK_SSE4A, "__builtin_ia32_movntss",
|
||||
void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS);
|
||||
def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi",
|
||||
v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI);
|
||||
def_builtin (MASK_SSE4A, "__builtin_ia32_extrq",
|
||||
v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ);
|
||||
def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi",
|
||||
v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI);
|
||||
def_builtin (MASK_SSE4A, "__builtin_ia32_insertq",
|
||||
v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ);
|
||||
|
||||
/* Access to the vec_init patterns. */
|
||||
ftype = build_function_type_list (V2SI_type_node, integer_type_node,
|
||||
integer_type_node, NULL_TREE);
|
||||
@ -17190,9 +17435,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
|
||||
enum insn_code icode;
|
||||
tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
|
||||
tree arglist = TREE_OPERAND (exp, 1);
|
||||
tree arg0, arg1, arg2;
|
||||
rtx op0, op1, op2, pat;
|
||||
enum machine_mode tmode, mode0, mode1, mode2, mode3;
|
||||
tree arg0, arg1, arg2, arg3;
|
||||
rtx op0, op1, op2, op3, pat;
|
||||
enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4;
|
||||
unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
|
||||
|
||||
switch (fcode)
|
||||
@ -17610,6 +17855,114 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
|
||||
emit_insn (pat);
|
||||
return target;
|
||||
|
||||
case IX86_BUILTIN_MOVNTSD:
|
||||
return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, arglist);
|
||||
|
||||
case IX86_BUILTIN_MOVNTSS:
|
||||
return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, arglist);
|
||||
|
||||
case IX86_BUILTIN_INSERTQ:
|
||||
case IX86_BUILTIN_EXTRQ:
|
||||
icode = (fcode == IX86_BUILTIN_EXTRQ
|
||||
? CODE_FOR_sse4a_extrq
|
||||
: CODE_FOR_sse4a_insertq);
|
||||
arg0 = TREE_VALUE (arglist);
|
||||
arg1 = TREE_VALUE (TREE_CHAIN (arglist));
|
||||
op0 = expand_normal (arg0);
|
||||
op1 = expand_normal (arg1);
|
||||
tmode = insn_data[icode].operand[0].mode;
|
||||
mode1 = insn_data[icode].operand[1].mode;
|
||||
mode2 = insn_data[icode].operand[2].mode;
|
||||
if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
|
||||
op0 = copy_to_mode_reg (mode1, op0);
|
||||
if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
|
||||
op1 = copy_to_mode_reg (mode2, op1);
|
||||
if (optimize || target == 0
|
||||
|| GET_MODE (target) != tmode
|
||||
|| ! (*insn_data[icode].operand[0].predicate) (target, tmode))
|
||||
target = gen_reg_rtx (tmode);
|
||||
pat = GEN_FCN (icode) (target, op0, op1);
|
||||
if (! pat)
|
||||
return NULL_RTX;
|
||||
emit_insn (pat);
|
||||
return target;
|
||||
|
||||
case IX86_BUILTIN_EXTRQI:
|
||||
icode = CODE_FOR_sse4a_extrqi;
|
||||
arg0 = TREE_VALUE (arglist);
|
||||
arg1 = TREE_VALUE (TREE_CHAIN (arglist));
|
||||
arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
|
||||
op0 = expand_normal (arg0);
|
||||
op1 = expand_normal (arg1);
|
||||
op2 = expand_normal (arg2);
|
||||
tmode = insn_data[icode].operand[0].mode;
|
||||
mode1 = insn_data[icode].operand[1].mode;
|
||||
mode2 = insn_data[icode].operand[2].mode;
|
||||
mode3 = insn_data[icode].operand[3].mode;
|
||||
if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
|
||||
op0 = copy_to_mode_reg (mode1, op0);
|
||||
if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
|
||||
{
|
||||
error ("index mask must be an immediate");
|
||||
return gen_reg_rtx (tmode);
|
||||
}
|
||||
if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
|
||||
{
|
||||
error ("length mask must be an immediate");
|
||||
return gen_reg_rtx (tmode);
|
||||
}
|
||||
if (optimize || target == 0
|
||||
|| GET_MODE (target) != tmode
|
||||
|| ! (*insn_data[icode].operand[0].predicate) (target, tmode))
|
||||
target = gen_reg_rtx (tmode);
|
||||
pat = GEN_FCN (icode) (target, op0, op1, op2);
|
||||
if (! pat)
|
||||
return NULL_RTX;
|
||||
emit_insn (pat);
|
||||
return target;
|
||||
|
||||
case IX86_BUILTIN_INSERTQI:
|
||||
icode = CODE_FOR_sse4a_insertqi;
|
||||
arg0 = TREE_VALUE (arglist);
|
||||
arg1 = TREE_VALUE (TREE_CHAIN (arglist));
|
||||
arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
|
||||
arg3 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
|
||||
op0 = expand_normal (arg0);
|
||||
op1 = expand_normal (arg1);
|
||||
op2 = expand_normal (arg2);
|
||||
op3 = expand_normal (arg3);
|
||||
tmode = insn_data[icode].operand[0].mode;
|
||||
mode1 = insn_data[icode].operand[1].mode;
|
||||
mode2 = insn_data[icode].operand[2].mode;
|
||||
mode3 = insn_data[icode].operand[3].mode;
|
||||
mode4 = insn_data[icode].operand[4].mode;
|
||||
|
||||
if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
|
||||
op0 = copy_to_mode_reg (mode1, op0);
|
||||
|
||||
if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
|
||||
op1 = copy_to_mode_reg (mode2, op1);
|
||||
|
||||
if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
|
||||
{
|
||||
error ("index mask must be an immediate");
|
||||
return gen_reg_rtx (tmode);
|
||||
}
|
||||
if (! (*insn_data[icode].operand[4].predicate) (op3, mode4))
|
||||
{
|
||||
error ("length mask must be an immediate");
|
||||
return gen_reg_rtx (tmode);
|
||||
}
|
||||
if (optimize || target == 0
|
||||
|| GET_MODE (target) != tmode
|
||||
|| ! (*insn_data[icode].operand[0].predicate) (target, tmode))
|
||||
target = gen_reg_rtx (tmode);
|
||||
pat = GEN_FCN (icode) (target, op0, op1, op2, op3);
|
||||
if (! pat)
|
||||
return NULL_RTX;
|
||||
emit_insn (pat);
|
||||
return target;
|
||||
|
||||
case IX86_BUILTIN_VEC_INIT_V2SI:
|
||||
case IX86_BUILTIN_VEC_INIT_V4HI:
|
||||
case IX86_BUILTIN_VEC_INIT_V8QI:
|
||||
|
@ -177,6 +177,7 @@ extern const struct processor_costs *ix86_cost;
|
||||
#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
|
||||
#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
|
||||
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
|
||||
#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
|
||||
|
||||
#define TUNEMASK (1 << ix86_tune)
|
||||
extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
|
||||
@ -195,6 +196,7 @@ extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
|
||||
extern const int x86_epilogue_using_move, x86_decompose_lea;
|
||||
extern const int x86_arch_always_fancy_math_387, x86_shift1;
|
||||
extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs;
|
||||
extern const int x86_sse_unaligned_move_optimal;
|
||||
extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor;
|
||||
extern const int x86_use_ffreep;
|
||||
extern const int x86_inter_unit_moves, x86_schedule;
|
||||
@ -244,6 +246,8 @@ extern int x86_prefetch_sse, x86_cmpxchg16b;
|
||||
#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & TUNEMASK)
|
||||
#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
|
||||
(x86_sse_partial_reg_dependency & TUNEMASK)
|
||||
#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
|
||||
(x86_sse_unaligned_move_optimal & TUNEMASK)
|
||||
#define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & TUNEMASK)
|
||||
#define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & TUNEMASK)
|
||||
#define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & TUNEMASK)
|
||||
@ -436,6 +440,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|
||||
} \
|
||||
else if (TARGET_K8) \
|
||||
builtin_define ("__tune_k8__"); \
|
||||
else if (TARGET_AMDFAM10) \
|
||||
builtin_define ("__tune_amdfam10__"); \
|
||||
else if (TARGET_PENTIUM4) \
|
||||
builtin_define ("__tune_pentium4__"); \
|
||||
else if (TARGET_NOCONA) \
|
||||
@ -457,6 +463,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|
||||
builtin_define ("__SSE3__"); \
|
||||
if (TARGET_SSSE3) \
|
||||
builtin_define ("__SSSE3__"); \
|
||||
if (TARGET_SSE4A) \
|
||||
builtin_define ("__SSE4A__"); \
|
||||
if (TARGET_SSE_MATH && TARGET_SSE) \
|
||||
builtin_define ("__SSE_MATH__"); \
|
||||
if (TARGET_SSE_MATH && TARGET_SSE2) \
|
||||
@ -512,6 +520,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|
||||
builtin_define ("__k8"); \
|
||||
builtin_define ("__k8__"); \
|
||||
} \
|
||||
else if (ix86_arch == PROCESSOR_AMDFAM10) \
|
||||
{ \
|
||||
builtin_define ("__amdfam10"); \
|
||||
builtin_define ("__amdfam10__"); \
|
||||
} \
|
||||
else if (ix86_arch == PROCESSOR_PENTIUM4) \
|
||||
{ \
|
||||
builtin_define ("__pentium4"); \
|
||||
@ -550,13 +563,14 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
|
||||
#define TARGET_CPU_DEFAULT_nocona 17
|
||||
#define TARGET_CPU_DEFAULT_core2 18
|
||||
#define TARGET_CPU_DEFAULT_generic 19
|
||||
#define TARGET_CPU_DEFAULT_amdfam10 20
|
||||
|
||||
#define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\
|
||||
"pentiumpro", "pentium2", "pentium3", \
|
||||
"pentium4", "geode", "k6", "k6-2", "k6-3", \
|
||||
"athlon", "athlon-4", "k8", \
|
||||
"pentium-m", "prescott", "nocona", \
|
||||
"core2", "generic"}
|
||||
"core2", "generic", "amdfam10"}
|
||||
|
||||
#ifndef CC1_SPEC
|
||||
#define CC1_SPEC "%(cc1_cpu) "
|
||||
@ -2105,6 +2119,7 @@ enum processor_type
|
||||
PROCESSOR_CORE2,
|
||||
PROCESSOR_GENERIC32,
|
||||
PROCESSOR_GENERIC64,
|
||||
PROCESSOR_AMDFAM10,
|
||||
PROCESSOR_max
|
||||
};
|
||||
|
||||
|
@ -154,6 +154,12 @@
|
||||
(UNSPEC_PSHUFB 120)
|
||||
(UNSPEC_PSIGN 121)
|
||||
(UNSPEC_PALIGNR 122)
|
||||
|
||||
; For SSE4A support
|
||||
(UNSPEC_EXTRQI 130)
|
||||
(UNSPEC_EXTRQ 131)
|
||||
(UNSPEC_INSERTQI 132)
|
||||
(UNSPEC_INSERTQ 133)
|
||||
])
|
||||
|
||||
(define_constants
|
||||
@ -195,7 +201,8 @@
|
||||
|
||||
;; Processor type. This attribute must exactly match the processor_type
|
||||
;; enumeration in i386.h.
|
||||
(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64"
|
||||
(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,
|
||||
nocona,core2,generic32,generic64,amdfam10"
|
||||
(const (symbol_ref "ix86_tune")))
|
||||
|
||||
;; A basic instruction type. Refinements due to arguments to be
|
||||
@ -206,10 +213,10 @@
|
||||
incdec,ishift,ishift1,rotate,rotate1,imul,idiv,
|
||||
icmp,test,ibr,setcc,icmov,
|
||||
push,pop,call,callv,leave,
|
||||
str,
|
||||
str,bitmanip,
|
||||
fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint,
|
||||
sselog,sselog1,sseiadd,sseishft,sseimul,
|
||||
sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,
|
||||
sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins,
|
||||
mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
|
||||
(const_string "other"))
|
||||
|
||||
@ -223,7 +230,7 @@
|
||||
(cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
|
||||
(const_string "i387")
|
||||
(eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul,
|
||||
sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv")
|
||||
sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins")
|
||||
(const_string "sse")
|
||||
(eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
|
||||
(const_string "mmx")
|
||||
@ -233,7 +240,8 @@
|
||||
|
||||
;; The (bounding maximum) length of an instruction immediate.
|
||||
(define_attr "length_immediate" ""
|
||||
(cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave")
|
||||
(cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
|
||||
bitmanip")
|
||||
(const_int 0)
|
||||
(eq_attr "unit" "i387,sse,mmx")
|
||||
(const_int 0)
|
||||
@ -287,7 +295,7 @@
|
||||
;; Set when 0f opcode prefix is used.
|
||||
(define_attr "prefix_0f" ""
|
||||
(if_then_else
|
||||
(ior (eq_attr "type" "imovx,setcc,icmov")
|
||||
(ior (eq_attr "type" "imovx,setcc,icmov,bitmanip")
|
||||
(eq_attr "unit" "sse,mmx"))
|
||||
(const_int 1)
|
||||
(const_int 0)))
|
||||
@ -416,7 +424,7 @@
|
||||
(const_string "load")
|
||||
(and (eq_attr "type"
|
||||
"!alu1,negnot,ishift1,
|
||||
imov,imovx,icmp,test,
|
||||
imov,imovx,icmp,test,bitmanip,
|
||||
fmov,fcmp,fsgn,
|
||||
sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1,
|
||||
mmx,mmxmov,mmxcmp,mmxcvt")
|
||||
@ -977,10 +985,11 @@
|
||||
"sahf"
|
||||
[(set_attr "length" "1")
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
;; Pentium Pro can do steps 1 through 3 in one go.
|
||||
|
||||
;; comi*, ucomi*, fcomi*, ficomi*,fucomi* (i387 instructions set condition codes)
|
||||
(define_insn "*cmpfp_i_mixed"
|
||||
[(set (reg:CCFP FLAGS_REG)
|
||||
(compare:CCFP (match_operand 0 "register_operand" "f,x")
|
||||
@ -994,7 +1003,8 @@
|
||||
(if_then_else (match_operand:SF 1 "" "")
|
||||
(const_string "SF")
|
||||
(const_string "DF")))
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "*cmpfp_i_sse"
|
||||
[(set (reg:CCFP FLAGS_REG)
|
||||
@ -1009,7 +1019,8 @@
|
||||
(if_then_else (match_operand:SF 1 "" "")
|
||||
(const_string "SF")
|
||||
(const_string "DF")))
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "*cmpfp_i_i387"
|
||||
[(set (reg:CCFP FLAGS_REG)
|
||||
@ -1028,7 +1039,8 @@
|
||||
(const_string "DF")
|
||||
]
|
||||
(const_string "XF")))
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "*cmpfp_iu_mixed"
|
||||
[(set (reg:CCFPU FLAGS_REG)
|
||||
@ -1043,7 +1055,8 @@
|
||||
(if_then_else (match_operand:SF 1 "" "")
|
||||
(const_string "SF")
|
||||
(const_string "DF")))
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "*cmpfp_iu_sse"
|
||||
[(set (reg:CCFPU FLAGS_REG)
|
||||
@ -1058,7 +1071,8 @@
|
||||
(if_then_else (match_operand:SF 1 "" "")
|
||||
(const_string "SF")
|
||||
(const_string "DF")))
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "*cmpfp_iu_387"
|
||||
[(set (reg:CCFPU FLAGS_REG)
|
||||
@ -1077,7 +1091,8 @@
|
||||
(const_string "DF")
|
||||
]
|
||||
(const_string "XF")))
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
;; Move instructions.
|
||||
|
||||
@ -1283,7 +1298,8 @@
|
||||
[(set_attr "type" "imov")
|
||||
(set_attr "mode" "SI")
|
||||
(set_attr "pent_pair" "np")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "double")])
|
||||
|
||||
(define_expand "movhi"
|
||||
[(set (match_operand:HI 0 "nonimmediate_operand" "")
|
||||
@ -1400,8 +1416,10 @@
|
||||
[(set_attr "type" "imov")
|
||||
(set_attr "mode" "SI")
|
||||
(set_attr "pent_pair" "np")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "double")])
|
||||
|
||||
;; amdfam10_decode is not set here since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10
|
||||
(define_insn "*swaphi_2"
|
||||
[(set (match_operand:HI 0 "register_operand" "+r")
|
||||
(match_operand:HI 1 "register_operand" "+r"))
|
||||
@ -1574,8 +1592,10 @@
|
||||
[(set_attr "type" "imov")
|
||||
(set_attr "mode" "SI")
|
||||
(set_attr "pent_pair" "np")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "vector")])
|
||||
|
||||
;; amdfam10_decode is not set here since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10
|
||||
(define_insn "*swapqi_2"
|
||||
[(set (match_operand:QI 0 "register_operand" "+q")
|
||||
(match_operand:QI 1 "register_operand" "+q"))
|
||||
@ -2139,7 +2159,8 @@
|
||||
[(set_attr "type" "imov")
|
||||
(set_attr "mode" "DI")
|
||||
(set_attr "pent_pair" "np")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "double")])
|
||||
|
||||
(define_expand "movti"
|
||||
[(set (match_operand:TI 0 "nonimmediate_operand" "")
|
||||
@ -4179,7 +4200,8 @@
|
||||
"cvttss2si{q}\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "SF")
|
||||
(set_attr "athlon_decode" "double,vector")])
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")])
|
||||
|
||||
(define_insn "fix_truncdfdi_sse"
|
||||
[(set (match_operand:DI 0 "register_operand" "=r,r")
|
||||
@ -4188,7 +4210,8 @@
|
||||
"cvttsd2si{q}\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,vector")])
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")])
|
||||
|
||||
(define_insn "fix_truncsfsi_sse"
|
||||
[(set (match_operand:SI 0 "register_operand" "=r,r")
|
||||
@ -4197,7 +4220,8 @@
|
||||
"cvttss2si\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,vector")])
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")])
|
||||
|
||||
(define_insn "fix_truncdfsi_sse"
|
||||
[(set (match_operand:SI 0 "register_operand" "=r,r")
|
||||
@ -4206,7 +4230,8 @@
|
||||
"cvttsd2si\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,vector")])
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")])
|
||||
|
||||
;; Shorten x87->SSE reload sequences of fix_trunc?f?i_sse patterns.
|
||||
(define_peephole2
|
||||
@ -4488,7 +4513,8 @@
|
||||
[(set_attr "length" "2")
|
||||
(set_attr "mode" "HI")
|
||||
(set_attr "unit" "i387")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "vector")])
|
||||
|
||||
;; Conversion between fixed point and floating point.
|
||||
|
||||
@ -4539,6 +4565,7 @@
|
||||
(set_attr "mode" "SF")
|
||||
(set_attr "unit" "*,i387,*,*")
|
||||
(set_attr "athlon_decode" "*,*,vector,double")
|
||||
(set_attr "amdfam10_decode" "*,*,vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatsisf2_sse"
|
||||
@ -4549,6 +4576,7 @@
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "SF")
|
||||
(set_attr "athlon_decode" "vector,double")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatsisf2_i387"
|
||||
@ -4582,6 +4610,7 @@
|
||||
(set_attr "mode" "SF")
|
||||
(set_attr "unit" "*,i387,*,*")
|
||||
(set_attr "athlon_decode" "*,*,vector,double")
|
||||
(set_attr "amdfam10_decode" "*,*,vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatdisf2_sse"
|
||||
@ -4592,6 +4621,7 @@
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "SF")
|
||||
(set_attr "athlon_decode" "vector,double")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatdisf2_i387"
|
||||
@ -4650,6 +4680,7 @@
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "unit" "*,i387,*,*")
|
||||
(set_attr "athlon_decode" "*,*,double,direct")
|
||||
(set_attr "amdfam10_decode" "*,*,vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatsidf2_sse"
|
||||
@ -4660,6 +4691,7 @@
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,direct")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatsidf2_i387"
|
||||
@ -4693,6 +4725,7 @@
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "unit" "*,i387,*,*")
|
||||
(set_attr "athlon_decode" "*,*,double,direct")
|
||||
(set_attr "amdfam10_decode" "*,*,vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatdidf2_sse"
|
||||
@ -4703,6 +4736,7 @@
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,direct")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "fp_int_src" "true")])
|
||||
|
||||
(define_insn "*floatdidf2_i387"
|
||||
@ -6910,6 +6944,14 @@
|
||||
"TARGET_64BIT"
|
||||
"")
|
||||
|
||||
;; On AMDFAM10
|
||||
;; IMUL reg64, reg64, imm8 Direct
|
||||
;; IMUL reg64, mem64, imm8 VectorPath
|
||||
;; IMUL reg64, reg64, imm32 Direct
|
||||
;; IMUL reg64, mem64, imm32 VectorPath
|
||||
;; IMUL reg64, reg64 Direct
|
||||
;; IMUL reg64, mem64 Direct
|
||||
|
||||
(define_insn "*muldi3_1_rex64"
|
||||
[(set (match_operand:DI 0 "register_operand" "=r,r,r")
|
||||
(mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0")
|
||||
@ -6932,6 +6974,11 @@
|
||||
(match_operand 1 "memory_operand" ""))
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set (attr "amdfam10_decode")
|
||||
(cond [(and (eq_attr "alternative" "0,1")
|
||||
(match_operand 1 "memory_operand" ""))
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_expand "mulsi3"
|
||||
@ -6942,6 +6989,14 @@
|
||||
""
|
||||
"")
|
||||
|
||||
;; On AMDFAM10
|
||||
;; IMUL reg32, reg32, imm8 Direct
|
||||
;; IMUL reg32, mem32, imm8 VectorPath
|
||||
;; IMUL reg32, reg32, imm32 Direct
|
||||
;; IMUL reg32, mem32, imm32 VectorPath
|
||||
;; IMUL reg32, reg32 Direct
|
||||
;; IMUL reg32, mem32 Direct
|
||||
|
||||
(define_insn "*mulsi3_1"
|
||||
[(set (match_operand:SI 0 "register_operand" "=r,r,r")
|
||||
(mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0")
|
||||
@ -6963,6 +7018,11 @@
|
||||
(match_operand 1 "memory_operand" ""))
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set (attr "amdfam10_decode")
|
||||
(cond [(and (eq_attr "alternative" "0,1")
|
||||
(match_operand 1 "memory_operand" ""))
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "*mulsi3_1_zext"
|
||||
@ -6988,6 +7048,11 @@
|
||||
(match_operand 1 "memory_operand" ""))
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set (attr "amdfam10_decode")
|
||||
(cond [(and (eq_attr "alternative" "0,1")
|
||||
(match_operand 1 "memory_operand" ""))
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_expand "mulhi3"
|
||||
@ -6998,6 +7063,13 @@
|
||||
"TARGET_HIMODE_MATH"
|
||||
"")
|
||||
|
||||
;; On AMDFAM10
|
||||
;; IMUL reg16, reg16, imm8 VectorPath
|
||||
;; IMUL reg16, mem16, imm8 VectorPath
|
||||
;; IMUL reg16, reg16, imm16 VectorPath
|
||||
;; IMUL reg16, mem16, imm16 VectorPath
|
||||
;; IMUL reg16, reg16 Direct
|
||||
;; IMUL reg16, mem16 Direct
|
||||
(define_insn "*mulhi3_1"
|
||||
[(set (match_operand:HI 0 "register_operand" "=r,r,r")
|
||||
(mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0")
|
||||
@ -7016,6 +7088,10 @@
|
||||
(eq_attr "alternative" "1,2")
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set (attr "amdfam10_decode")
|
||||
(cond [(eq_attr "alternative" "0,1")
|
||||
(const_string "vector")]
|
||||
(const_string "direct")))
|
||||
(set_attr "mode" "HI")])
|
||||
|
||||
(define_expand "mulqi3"
|
||||
@ -7026,6 +7102,10 @@
|
||||
"TARGET_QIMODE_MATH"
|
||||
"")
|
||||
|
||||
;; On AMDFAM10
|
||||
;; MUL reg8 Direct
|
||||
;; MUL mem8 Direct
|
||||
|
||||
(define_insn "*mulqi3_1"
|
||||
[(set (match_operand:QI 0 "register_operand" "=a")
|
||||
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
|
||||
@ -7040,6 +7120,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "direct")))
|
||||
(set_attr "amdfam10_decode" "direct")
|
||||
(set_attr "mode" "QI")])
|
||||
|
||||
(define_expand "umulqihi3"
|
||||
@ -7066,6 +7147,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "direct")))
|
||||
(set_attr "amdfam10_decode" "direct")
|
||||
(set_attr "mode" "QI")])
|
||||
|
||||
(define_expand "mulqihi3"
|
||||
@ -7090,6 +7172,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "direct")))
|
||||
(set_attr "amdfam10_decode" "direct")
|
||||
(set_attr "mode" "QI")])
|
||||
|
||||
(define_expand "umulditi3"
|
||||
@ -7116,6 +7199,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers
|
||||
@ -7143,6 +7227,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_expand "mulditi3"
|
||||
@ -7169,6 +7254,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_expand "mulsidi3"
|
||||
@ -7195,6 +7281,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_expand "umuldi3_highpart"
|
||||
@ -7231,6 +7318,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_expand "umulsi3_highpart"
|
||||
@ -7266,6 +7354,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "*umulsi3_highpart_zext"
|
||||
@ -7288,6 +7377,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_expand "smuldi3_highpart"
|
||||
@ -7323,6 +7413,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_expand "smulsi3_highpart"
|
||||
@ -7357,6 +7448,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "*smulsi3_highpart_zext"
|
||||
@ -7378,6 +7470,7 @@
|
||||
(if_then_else (eq_attr "cpu" "athlon")
|
||||
(const_string "vector")
|
||||
(const_string "double")))
|
||||
(set_attr "amdfam10_decode" "double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
;; The patterns that match these are at the end of this file.
|
||||
@ -10359,7 +10452,8 @@
|
||||
[(set_attr "type" "ishift")
|
||||
(set_attr "prefix_0f" "1")
|
||||
(set_attr "mode" "DI")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "vector")])
|
||||
|
||||
(define_expand "x86_64_shift_adj"
|
||||
[(set (reg:CCZ FLAGS_REG)
|
||||
@ -10574,7 +10668,8 @@
|
||||
(set_attr "prefix_0f" "1")
|
||||
(set_attr "mode" "SI")
|
||||
(set_attr "pent_pair" "np")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "vector")])
|
||||
|
||||
(define_expand "x86_shift_adj_1"
|
||||
[(set (reg:CCZ FLAGS_REG)
|
||||
@ -11334,7 +11429,8 @@
|
||||
[(set_attr "type" "ishift")
|
||||
(set_attr "prefix_0f" "1")
|
||||
(set_attr "mode" "DI")
|
||||
(set_attr "athlon_decode" "vector")])
|
||||
(set_attr "athlon_decode" "vector")
|
||||
(set_attr "amdfam10_decode" "vector")])
|
||||
|
||||
(define_expand "ashrdi3"
|
||||
[(set (match_operand:DI 0 "shiftdi_operand" "")
|
||||
@ -14608,7 +14704,23 @@
|
||||
[(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31)))
|
||||
(clobber (reg:CC FLAGS_REG))])]
|
||||
""
|
||||
"")
|
||||
{
|
||||
if (TARGET_ABM)
|
||||
{
|
||||
emit_insn (gen_clzsi2_abm (operands[0], operands[1]));
|
||||
DONE;
|
||||
}
|
||||
})
|
||||
|
||||
;; Count leading zeros of a 32-bit value with the lzcnt instruction.
;; Only available when TARGET_ABM is set; the pattern clobbers the flags.
;; Operand 1 carries an explicit "rm" constraint (register or memory) so
;; that it agrees with its nonimmediate_operand predicate and with the
;; "rm" constraint used by the *bsrhi and *popcount..._cmp patterns;
;; an empty constraint string gives register allocation no guidance.
(define_insn "clzsi2_abm"
  [(set (match_operand:SI 0 "register_operand" "=r")
	(clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_ABM"
  "lzcnt{l}\t{%1, %0|%0, %1}"
  [(set_attr "prefix_rep" "1")
   (set_attr "type" "bitmanip")
   (set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "*bsr"
|
||||
[(set (match_operand:SI 0 "register_operand" "=r")
|
||||
@ -14617,7 +14729,44 @@
|
||||
(clobber (reg:CC FLAGS_REG))]
|
||||
""
|
||||
"bsr{l}\t{%1, %0|%0, %1}"
|
||||
[(set_attr "prefix_0f" "1")])
|
||||
[(set_attr "prefix_0f" "1")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
;; 32-bit population count using the popcnt instruction.
;; Enabled by TARGET_POPCNT; the pattern clobbers the flags register.
;; Operand 1 uses the "rm" constraint (register or memory), matching
;; its nonimmediate_operand predicate and the flag-setting
;; *popcountsi2_cmp pattern, instead of an empty constraint string.
(define_insn "popcountsi2"
  [(set (match_operand:SI 0 "register_operand" "=r")
	(popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_POPCNT"
  "popcnt{l}\t{%1, %0|%0, %1}"
  [(set_attr "prefix_rep" "1")
   (set_attr "type" "bitmanip")
   (set_attr "mode" "SI")])
|
||||
|
||||
;; popcnt form whose RTL also records the flags result: FLAGS_REG is set
;; from comparing the population count of operand 1 against zero, and
;; operand 0 receives the count itself.  Accepted only when
;; ix86_match_ccmode approves CCZmode for the insn.
(define_insn "*popcountsi2_cmp"
  [(set (reg FLAGS_REG)
	(compare
	  (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))
	  (const_int 0)))
   (set (match_operand:SI 0 "register_operand" "=r")
	(popcount:SI (match_dup 1)))]
  "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
  "popcnt{l}\t{%1, %0|%0, %1}"
  [(set_attr "type" "bitmanip")
   (set_attr "mode" "SI")
   (set_attr "prefix_rep" "1")])
|
||||
|
||||
;; 64-bit-target variant of the flag-setting 32-bit popcnt: operand 0 is
;; a DImode register holding the zero-extended SImode population count,
;; while FLAGS_REG is set from comparing that count with zero.
(define_insn "*popcountsi2_cmp_zext"
  [(set (reg FLAGS_REG)
	(compare
	  (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))
	  (const_int 0)))
   (set (match_operand:DI 0 "register_operand" "=r")
	(zero_extend:DI (popcount:SI (match_dup 1))))]
  "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
  "popcnt{l}\t{%1, %0|%0, %1}"
  [(set_attr "type" "bitmanip")
   (set_attr "mode" "SI")
   (set_attr "prefix_rep" "1")])
|
||||
|
||||
(define_insn "bswapsi2"
|
||||
[(set (match_operand:SI 0 "register_operand" "=r")
|
||||
@ -14647,7 +14796,23 @@
|
||||
[(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63)))
|
||||
(clobber (reg:CC FLAGS_REG))])]
|
||||
"TARGET_64BIT"
|
||||
"")
|
||||
{
|
||||
if (TARGET_ABM)
|
||||
{
|
||||
emit_insn (gen_clzdi2_abm (operands[0], operands[1]));
|
||||
DONE;
|
||||
}
|
||||
})
|
||||
|
||||
;; Count leading zeros of a 64-bit value with lzcnt{q}.
;; Requires both TARGET_64BIT and TARGET_ABM; clobbers the flags.
;; Operand 1 is given the "rm" constraint (register or memory) to agree
;; with its nonimmediate_operand predicate and with *popcountdi2_cmp,
;; rather than leaving the constraint string empty.
(define_insn "clzdi2_abm"
  [(set (match_operand:DI 0 "register_operand" "=r")
	(clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_64BIT && TARGET_ABM"
  "lzcnt{q}\t{%1, %0|%0, %1}"
  [(set_attr "prefix_rep" "1")
   (set_attr "type" "bitmanip")
   (set_attr "mode" "DI")])
|
||||
|
||||
(define_insn "*bsr_rex64"
|
||||
[(set (match_operand:DI 0 "register_operand" "=r")
|
||||
@ -14656,7 +14821,92 @@
|
||||
(clobber (reg:CC FLAGS_REG))]
|
||||
"TARGET_64BIT"
|
||||
"bsr{q}\t{%1, %0|%0, %1}"
|
||||
[(set_attr "prefix_0f" "1")])
|
||||
[(set_attr "prefix_0f" "1")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
;; 64-bit population count using popcnt{q}.
;; Requires TARGET_64BIT and TARGET_POPCNT; clobbers the flags register.
;; Operand 1 uses the "rm" constraint (register or memory), matching
;; its nonimmediate_operand predicate and the *popcountdi2_cmp pattern,
;; instead of an empty constraint string.
(define_insn "popcountdi2"
  [(set (match_operand:DI 0 "register_operand" "=r")
	(popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_64BIT && TARGET_POPCNT"
  "popcnt{q}\t{%1, %0|%0, %1}"
  [(set_attr "prefix_rep" "1")
   (set_attr "type" "bitmanip")
   (set_attr "mode" "DI")])
|
||||
|
||||
;; Flag-setting 64-bit popcnt: FLAGS_REG is set from comparing the
;; population count of operand 1 with zero, and operand 0 receives the
;; count.  Accepted only on 64-bit targets with popcnt and when
;; ix86_match_ccmode approves CCZmode for the insn.
(define_insn "*popcountdi2_cmp"
  [(set (reg FLAGS_REG)
	(compare
	  (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))
	  (const_int 0)))
   (set (match_operand:DI 0 "register_operand" "=r")
	(popcount:DI (match_dup 1)))]
  "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
  "popcnt{q}\t{%1, %0|%0, %1}"
  [(set_attr "type" "bitmanip")
   (set_attr "mode" "DI")
   (set_attr "prefix_rep" "1")])
|
||||
|
||||
;; Expander for 16-bit count-leading-zeros.
;; With TARGET_ABM a single clzhi2_abm insn is emitted and expansion
;; stops there.  Otherwise the two parallels below are generated: the
;; first computes 15 - clz (operand 1) into operand 0, and the second
;; XORs operand 0 with 15.  Both steps clobber the flags.
(define_expand "clzhi2"
  [(parallel
     [(set (match_operand:HI 0 "register_operand" "")
	   (minus:HI (const_int 15)
		     (clz:HI (match_operand:HI 1 "nonimmediate_operand" ""))))
      (clobber (reg:CC FLAGS_REG))])
   (parallel
     [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15)))
      (clobber (reg:CC FLAGS_REG))])]
  ""
{
  if (TARGET_ABM)
    {
      emit_insn (gen_clzhi2_abm (operands[0], operands[1]));
      DONE;
    }
})
|
||||
|
||||
;; Count leading zeros of a 16-bit value with lzcnt{w}.
;; Only available when TARGET_ABM is set; clobbers the flags.
;; Operand 1 is given the "rm" constraint (register or memory) so that
;; it agrees with its nonimmediate_operand predicate and with the
;; *bsrhi pattern, rather than leaving the constraint string empty.
(define_insn "clzhi2_abm"
  [(set (match_operand:HI 0 "register_operand" "=r")
	(clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_ABM"
  "lzcnt{w}\t{%1, %0|%0, %1}"
  [(set_attr "prefix_rep" "1")
   (set_attr "type" "bitmanip")
   (set_attr "mode" "HI")])
|
||||
|
||||
;; 16-bit bsr: matches operand 0 = 15 - clz (operand 1), i.e. the first
;; step produced by the clzhi2 expander.  Clobbers the flags.
(define_insn "*bsrhi"
  [(set (match_operand:HI 0 "register_operand" "=r")
	(minus:HI (const_int 15)
		  (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))))
   (clobber (reg:CC FLAGS_REG))]
  ""
  "bsr{w}\t{%1, %0|%0, %1}"
  [(set_attr "mode" "HI")
   (set_attr "prefix_0f" "1")])
|
||||
|
||||
;; 16-bit population count using popcnt{w}.
;; Enabled by TARGET_POPCNT; the pattern clobbers the flags register.
;; Operand 1 uses the "rm" constraint (register or memory), matching
;; its nonimmediate_operand predicate and the *popcounthi2_cmp pattern,
;; instead of an empty constraint string.
(define_insn "popcounthi2"
  [(set (match_operand:HI 0 "register_operand" "=r")
	(popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_POPCNT"
  "popcnt{w}\t{%1, %0|%0, %1}"
  [(set_attr "prefix_rep" "1")
   (set_attr "type" "bitmanip")
   (set_attr "mode" "HI")])
|
||||
|
||||
;; Flag-setting 16-bit popcnt: FLAGS_REG is set from comparing the
;; population count of operand 1 with zero, and operand 0 receives the
;; count.  Accepted only when ix86_match_ccmode approves CCZmode.
(define_insn "*popcounthi2_cmp"
  [(set (reg FLAGS_REG)
	(compare
	  (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))
	  (const_int 0)))
   (set (match_operand:HI 0 "register_operand" "=r")
	(popcount:HI (match_dup 1)))]
  "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
  "popcnt{w}\t{%1, %0|%0, %1}"
  [(set_attr "type" "bitmanip")
   (set_attr "mode" "HI")
   (set_attr "prefix_rep" "1")])
|
||||
|
||||
;; Thread-local storage patterns for ELF.
|
||||
;;
|
||||
@ -15564,7 +15814,8 @@
|
||||
"fsqrt"
|
||||
[(set_attr "type" "fpspc")
|
||||
(set_attr "mode" "XF")
|
||||
(set_attr "athlon_decode" "direct")])
|
||||
(set_attr "athlon_decode" "direct")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "sqrt_extend<mode>xf2_i387"
|
||||
[(set (match_operand:XF 0 "register_operand" "=f")
|
||||
@ -15575,7 +15826,8 @@
|
||||
"fsqrt"
|
||||
[(set_attr "type" "fpspc")
|
||||
(set_attr "mode" "XF")
|
||||
(set_attr "athlon_decode" "direct")])
|
||||
(set_attr "athlon_decode" "direct")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
(define_insn "*sqrt<mode>2_sse"
|
||||
[(set (match_operand:SSEMODEF 0 "register_operand" "=x")
|
||||
@ -15585,7 +15837,8 @@
|
||||
"sqrts<ssemodefsuffix>\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sse")
|
||||
(set_attr "mode" "<MODE>")
|
||||
(set_attr "athlon_decode" "*")])
|
||||
(set_attr "athlon_decode" "*")
|
||||
(set_attr "amdfam10_decode" "*")])
|
||||
|
||||
(define_expand "sqrt<mode>2"
|
||||
[(set (match_operand:X87MODEF12 0 "register_operand" "")
|
||||
@ -19995,7 +20248,7 @@
|
||||
(mult:DI (match_operand:DI 1 "memory_operand" "")
|
||||
(match_operand:DI 2 "immediate_operand" "")))
|
||||
(clobber (reg:CC FLAGS_REG))])]
|
||||
"(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||||
"(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||||
&& !satisfies_constraint_K (operands[2])"
|
||||
[(set (match_dup 3) (match_dup 1))
|
||||
(parallel [(set (match_dup 0) (mult:DI (match_dup 3) (match_dup 2)))
|
||||
@ -20008,7 +20261,7 @@
|
||||
(mult:SI (match_operand:SI 1 "memory_operand" "")
|
||||
(match_operand:SI 2 "immediate_operand" "")))
|
||||
(clobber (reg:CC FLAGS_REG))])]
|
||||
"(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||||
"(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||||
&& !satisfies_constraint_K (operands[2])"
|
||||
[(set (match_dup 3) (match_dup 1))
|
||||
(parallel [(set (match_dup 0) (mult:SI (match_dup 3) (match_dup 2)))
|
||||
@ -20022,7 +20275,7 @@
|
||||
(mult:SI (match_operand:SI 1 "memory_operand" "")
|
||||
(match_operand:SI 2 "immediate_operand" ""))))
|
||||
(clobber (reg:CC FLAGS_REG))])]
|
||||
"(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||||
"(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||||
&& !satisfies_constraint_K (operands[2])"
|
||||
[(set (match_dup 3) (match_dup 1))
|
||||
(parallel [(set (match_dup 0) (zero_extend:DI (mult:SI (match_dup 3) (match_dup 2))))
|
||||
@ -20039,7 +20292,7 @@
|
||||
(match_operand:DI 2 "const_int_operand" "")))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(match_scratch:DI 3 "r")]
|
||||
"(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||||
"(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||||
&& satisfies_constraint_K (operands[2])"
|
||||
[(set (match_dup 3) (match_dup 2))
|
||||
(parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3)))
|
||||
@ -20055,7 +20308,7 @@
|
||||
(match_operand:SI 2 "const_int_operand" "")))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(match_scratch:SI 3 "r")]
|
||||
"(TARGET_K8 || TARGET_GENERIC64) && !optimize_size
|
||||
"(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size
|
||||
&& satisfies_constraint_K (operands[2])"
|
||||
[(set (match_dup 3) (match_dup 2))
|
||||
(parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3)))
|
||||
@ -20071,7 +20324,7 @@
|
||||
(match_operand:HI 2 "immediate_operand" "")))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(match_scratch:HI 3 "r")]
|
||||
"(TARGET_K8 || TARGET_GENERIC64) && !optimize_size"
|
||||
"(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size"
|
||||
[(set (match_dup 3) (match_dup 2))
|
||||
(parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3)))
|
||||
(clobber (reg:CC FLAGS_REG))])]
|
||||
|
@ -1,6 +1,6 @@
|
||||
; Options for the IA-32 and AMD64 ports of the compiler.
|
||||
|
||||
; Copyright (C) 2005 Free Software Foundation, Inc.
|
||||
; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
|
||||
;
|
||||
; This file is part of GCC.
|
||||
;
|
||||
@ -205,6 +205,22 @@ mssse3
|
||||
Target Report Mask(SSSE3)
|
||||
Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation
|
||||
|
||||
msse4a
|
||||
Target Report Mask(SSE4A)
|
||||
Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation
|
||||
|
||||
mpopcnt
|
||||
Target Report Mask(POPCNT)
|
||||
Support code generation of popcount instruction for popcount built-ins
|
||||
namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll
|
||||
|
||||
mabm
|
||||
Target Report Mask(ABM)
|
||||
Support code generation of Advanced Bit Manipulation (ABM) instructions,
|
||||
which include popcnt and lzcnt instructions, for popcount and clz built-ins
|
||||
namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and
|
||||
__builtin_clz, __builtin_clzl, __builtin_clzll
|
||||
|
||||
msseregparm
|
||||
Target RejectNegative Mask(SSEREGPARM)
|
||||
Use SSE register passing conventions for SF and DF mode
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
@ -30,7 +30,11 @@
|
||||
#ifndef _PMMINTRIN_H_INCLUDED
|
||||
#define _PMMINTRIN_H_INCLUDED
|
||||
|
||||
#ifdef __SSE3__
|
||||
#ifndef __SSE3__
|
||||
# error "SSE3 instruction set not enabled"
|
||||
#else
|
||||
|
||||
/* We need definitions from the SSE2 and SSE header files.  */
|
||||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
;; GCC machine description for SSE instructions
|
||||
;; Copyright (C) 2005, 2006
|
||||
;; Copyright (C) 2005, 2006, 2007
|
||||
;; Free Software Foundation, Inc.
|
||||
;;
|
||||
;; This file is part of GCC.
|
||||
@ -956,6 +956,7 @@
|
||||
"cvtsi2ss\t{%2, %0|%0, %2}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "vector,double")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "mode" "SF")])
|
||||
|
||||
(define_insn "sse_cvtsi2ssq"
|
||||
@ -969,6 +970,7 @@
|
||||
"cvtsi2ssq\t{%2, %0|%0, %2}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "vector,double")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "mode" "SF")])
|
||||
|
||||
(define_insn "sse_cvtss2si"
|
||||
@ -992,6 +994,7 @@
|
||||
"cvtss2si\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "sse_cvtss2siq"
|
||||
@ -1015,6 +1018,7 @@
|
||||
"cvtss2siq\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_insn "sse_cvttss2si"
|
||||
@ -1027,6 +1031,7 @@
|
||||
"cvttss2si\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "sse_cvttss2siq"
|
||||
@ -1039,6 +1044,7 @@
|
||||
"cvttss2siq\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_insn "sse2_cvtdq2ps"
|
||||
@ -1944,7 +1950,8 @@
|
||||
"cvtsi2sd\t{%2, %0|%0, %2}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,direct")])
|
||||
(set_attr "athlon_decode" "double,direct")
|
||||
(set_attr "amdfam10_decode" "vector,double")])
|
||||
|
||||
(define_insn "sse2_cvtsi2sdq"
|
||||
[(set (match_operand:V2DF 0 "register_operand" "=x,x")
|
||||
@ -1957,7 +1964,8 @@
|
||||
"cvtsi2sdq\t{%2, %0|%0, %2}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DF")
|
||||
(set_attr "athlon_decode" "double,direct")])
|
||||
(set_attr "athlon_decode" "double,direct")
|
||||
(set_attr "amdfam10_decode" "vector,double")])
|
||||
|
||||
(define_insn "sse2_cvtsd2si"
|
||||
[(set (match_operand:SI 0 "register_operand" "=r,r")
|
||||
@ -1980,6 +1988,7 @@
|
||||
"cvtsd2si\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")
|
||||
(set_attr "mode" "SI")])
|
||||
|
||||
(define_insn "sse2_cvtsd2siq"
|
||||
@ -2003,6 +2012,7 @@
|
||||
"cvtsd2siq\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
(define_insn "sse2_cvttsd2si"
|
||||
@ -2015,7 +2025,8 @@
|
||||
"cvttsd2si\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "SI")
|
||||
(set_attr "athlon_decode" "double,vector")])
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")])
|
||||
|
||||
(define_insn "sse2_cvttsd2siq"
|
||||
[(set (match_operand:DI 0 "register_operand" "=r,r")
|
||||
@ -2027,7 +2038,8 @@
|
||||
"cvttsd2siq\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "sseicvt")
|
||||
(set_attr "mode" "DI")
|
||||
(set_attr "athlon_decode" "double,vector")])
|
||||
(set_attr "athlon_decode" "double,vector")
|
||||
(set_attr "amdfam10_decode" "double,double")])
|
||||
|
||||
(define_insn "sse2_cvtdq2pd"
|
||||
[(set (match_operand:V2DF 0 "register_operand" "=x")
|
||||
@ -2058,7 +2070,8 @@
|
||||
"TARGET_SSE2"
|
||||
"cvtpd2dq\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssecvt")
|
||||
(set_attr "mode" "TI")])
|
||||
(set_attr "mode" "TI")
|
||||
(set_attr "amdfam10_decode" "double")])
|
||||
|
||||
(define_expand "sse2_cvttpd2dq"
|
||||
[(set (match_operand:V4SI 0 "register_operand" "")
|
||||
@ -2076,7 +2089,8 @@
|
||||
"TARGET_SSE2"
|
||||
"cvttpd2dq\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssecvt")
|
||||
(set_attr "mode" "TI")])
|
||||
(set_attr "mode" "TI")
|
||||
(set_attr "amdfam10_decode" "double")])
|
||||
|
||||
(define_insn "sse2_cvtsd2ss"
|
||||
[(set (match_operand:V4SF 0 "register_operand" "=x,x")
|
||||
@ -2090,20 +2104,22 @@
|
||||
"cvtsd2ss\t{%2, %0|%0, %2}"
|
||||
[(set_attr "type" "ssecvt")
|
||||
(set_attr "athlon_decode" "vector,double")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "mode" "SF")])
|
||||
|
||||
(define_insn "sse2_cvtss2sd"
|
||||
[(set (match_operand:V2DF 0 "register_operand" "=x")
|
||||
[(set (match_operand:V2DF 0 "register_operand" "=x,x")
|
||||
(vec_merge:V2DF
|
||||
(float_extend:V2DF
|
||||
(vec_select:V2SF
|
||||
(match_operand:V4SF 2 "nonimmediate_operand" "xm")
|
||||
(match_operand:V4SF 2 "nonimmediate_operand" "x,m")
|
||||
(parallel [(const_int 0) (const_int 1)])))
|
||||
(match_operand:V2DF 1 "register_operand" "0")
|
||||
(match_operand:V2DF 1 "register_operand" "0,0")
|
||||
(const_int 1)))]
|
||||
"TARGET_SSE2"
|
||||
"cvtss2sd\t{%2, %0|%0, %2}"
|
||||
[(set_attr "type" "ssecvt")
|
||||
(set_attr "amdfam10_decode" "vector,double")
|
||||
(set_attr "mode" "DF")])
|
||||
|
||||
(define_expand "sse2_cvtpd2ps"
|
||||
@ -2124,7 +2140,8 @@
|
||||
"TARGET_SSE2"
|
||||
"cvtpd2ps\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssecvt")
|
||||
(set_attr "mode" "V4SF")])
|
||||
(set_attr "mode" "V4SF")
|
||||
(set_attr "amdfam10_decode" "double")])
|
||||
|
||||
(define_insn "sse2_cvtps2pd"
|
||||
[(set (match_operand:V2DF 0 "register_operand" "=x")
|
||||
@ -2135,7 +2152,8 @@
|
||||
"TARGET_SSE2"
|
||||
"cvtps2pd\t{%1, %0|%0, %1}"
|
||||
[(set_attr "type" "ssecvt")
|
||||
(set_attr "mode" "V2DF")])
|
||||
(set_attr "mode" "V2DF")
|
||||
(set_attr "amdfam10_decode" "direct")])
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;
|
||||
@ -5146,3 +5164,92 @@
|
||||
"pabs<mmxvecsize>\t{%1, %0|%0, %1}";
|
||||
[(set_attr "type" "sselog1")
|
||||
(set_attr "mode" "DI")])
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;
|
||||
;; AMD SSE4A instructions
|
||||
;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; movntsd store of element 0 of a V2DF register (operand 1) into a
;; DFmode memory location (operand 0), wrapped in UNSPEC_MOVNT.
;; Requires TARGET_SSE4A.
(define_insn "sse4a_vmmovntv2df"
  [(set (match_operand:DF 0 "memory_operand" "=m")
	(unspec:DF
	  [(vec_select:DF
	     (match_operand:V2DF 1 "register_operand" "x")
	     (parallel [(const_int 0)]))]
	  UNSPEC_MOVNT))]
  "TARGET_SSE4A"
  "movntsd\t{%1, %0|%0, %1}"
  [(set_attr "mode" "DF")
   (set_attr "type" "ssemov")])
|
||||
|
||||
;; movntsd store of a scalar DFmode SSE register (operand 1) into
;; memory (operand 0), wrapped in UNSPEC_MOVNT.  Requires TARGET_SSE4A.
(define_insn "sse4a_movntdf"
  [(set (match_operand:DF 0 "memory_operand" "=m")
	(unspec:DF
	  [(match_operand:DF 1 "register_operand" "x")]
	  UNSPEC_MOVNT))]
  "TARGET_SSE4A"
  "movntsd\t{%1, %0|%0, %1}"
  [(set_attr "mode" "DF")
   (set_attr "type" "ssemov")])
|
||||
|
||||
;; movntss store of element 0 of a V4SF register (operand 1) into an
;; SFmode memory location (operand 0), wrapped in UNSPEC_MOVNT.
;; Requires TARGET_SSE4A.
(define_insn "sse4a_vmmovntv4sf"
  [(set (match_operand:SF 0 "memory_operand" "=m")
	(unspec:SF
	  [(vec_select:SF
	     (match_operand:V4SF 1 "register_operand" "x")
	     (parallel [(const_int 0)]))]
	  UNSPEC_MOVNT))]
  "TARGET_SSE4A"
  "movntss\t{%1, %0|%0, %1}"
  [(set_attr "mode" "SF")
   (set_attr "type" "ssemov")])
|
||||
|
||||
;; movntss store of a scalar SFmode SSE register (operand 1) into
;; memory (operand 0), wrapped in UNSPEC_MOVNT.  Requires TARGET_SSE4A.
(define_insn "sse4a_movntsf"
  [(set (match_operand:SF 0 "memory_operand" "=m")
	(unspec:SF
	  [(match_operand:SF 1 "register_operand" "x")]
	  UNSPEC_MOVNT))]
  "TARGET_SSE4A"
  "movntss\t{%1, %0|%0, %1}"
  [(set_attr "mode" "SF")
   (set_attr "type" "ssemov")])
|
||||
|
||||
;; extrq with two immediate operands (operands 2 and 3, both
;; const_int).  Operand 1 is tied to the destination register, so the
;; instruction updates operand 0 in place.  Requires TARGET_SSE4A.
(define_insn "sse4a_extrqi"
  [(set (match_operand:V2DI 0 "register_operand" "=x")
	(unspec:V2DI
	  [(match_operand:V2DI 1 "register_operand" "0")
	   (match_operand 2 "const_int_operand" "")
	   (match_operand 3 "const_int_operand" "")]
	  UNSPEC_EXTRQI))]
  "TARGET_SSE4A"
  "extrq\t{%3, %2, %0|%0, %2, %3}"
  [(set_attr "mode" "TI")
   (set_attr "type" "sse")])
|
||||
|
||||
;; extrq with register operands only: operand 2 is a V16QI SSE
;; register, and operand 1 is tied to the destination register.
;; Requires TARGET_SSE4A.
(define_insn "sse4a_extrq"
  [(set (match_operand:V2DI 0 "register_operand" "=x")
	(unspec:V2DI
	  [(match_operand:V2DI 1 "register_operand" "0")
	   (match_operand:V16QI 2 "register_operand" "x")]
	  UNSPEC_EXTRQ))]
  "TARGET_SSE4A"
  "extrq\t{%2, %0|%0, %2}"
  [(set_attr "mode" "TI")
   (set_attr "type" "sse")])
|
||||
|
||||
;; insertq with a source SSE register (operand 2) and two immediate
;; operands (operands 3 and 4, both const_int).  Operand 1 is tied to
;; the destination register.  Requires TARGET_SSE4A.
(define_insn "sse4a_insertqi"
  [(set (match_operand:V2DI 0 "register_operand" "=x")
	(unspec:V2DI
	  [(match_operand:V2DI 1 "register_operand" "0")
	   (match_operand:V2DI 2 "register_operand" "x")
	   (match_operand 3 "const_int_operand" "")
	   (match_operand 4 "const_int_operand" "")]
	  UNSPEC_INSERTQI))]
  "TARGET_SSE4A"
  "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}"
  [(set_attr "mode" "TI")
   (set_attr "type" "sseins")])
|
||||
|
||||
;; insertq with register operands only: operand 2 is the source SSE
;; register and operand 1 is tied to the destination register.
;; Requires TARGET_SSE4A.
(define_insn "sse4a_insertq"
  [(set (match_operand:V2DI 0 "register_operand" "=x")
	(unspec:V2DI
	  [(match_operand:V2DI 1 "register_operand" "0")
	   (match_operand:V2DI 2 "register_operand" "x")]
	  UNSPEC_INSERTQ))]
  "TARGET_SSE4A"
  "insertq\t{%2, %0|%0, %2}"
  [(set_attr "mode" "TI")
   (set_attr "type" "sseins")])
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2006 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 2006, 2007 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GCC.
|
||||
|
||||
@ -30,7 +30,11 @@
|
||||
#ifndef _TMMINTRIN_H_INCLUDED
|
||||
#define _TMMINTRIN_H_INCLUDED
|
||||
|
||||
#ifdef __SSSE3__
|
||||
#ifndef __SSSE3__
|
||||
# error "SSSE3 instruction set not enabled"
|
||||
#else
|
||||
|
||||
/* We need definitions from the SSE3, SSE2 and SSE header files*/
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static __inline __m128i __attribute__((__always_inline__))
|
||||
|
@ -7269,6 +7269,23 @@ v4si __builtin_ia32_pabsd128 (v4si)
|
||||
v8hi __builtin_ia32_pabsw128 (v8hi)
|
||||
@end smallexample
|
||||
|
||||
The following built-in functions are available when @option{-msse4a} is used.
|
||||
|
||||
@smallexample
|
||||
void _mm_stream_sd (double*,__m128d);
|
||||
Generates the @code{movntsd} machine instruction.
|
||||
void _mm_stream_ss (float*,__m128);
|
||||
Generates the @code{movntss} machine instruction.
|
||||
__m128i _mm_extract_si64 (__m128i, __m128i);
|
||||
Generates the @code{extrq} machine instruction with only SSE register operands.
|
||||
__m128i _mm_extracti_si64 (__m128i, int, int);
|
||||
Generates the @code{extrq} machine instruction with SSE register and immediate operands.
|
||||
__m128i _mm_insert_si64 (__m128i, __m128i);
|
||||
Generates the @code{insertq} machine instruction with only SSE register operands.
|
||||
__m128i _mm_inserti_si64 (__m128i, __m128i, int, int);
|
||||
Generates the @code{insertq} machine instruction with SSE register and immediate operands.
|
||||
@end smallexample
|
||||
|
||||
The following built-in functions are available when @option{-m3dnow} is used.
|
||||
All of them generate the machine instruction that is part of the name.
|
||||
|
||||
|
@ -538,7 +538,7 @@ Objective-C and Objective-C++ Dialects}.
|
||||
-mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol
|
||||
-mno-wide-multiply -mrtd -malign-double @gol
|
||||
-mpreferred-stack-boundary=@var{num} @gol
|
||||
-mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol
|
||||
-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol
|
||||
-mthreads -mno-align-stringops -minline-all-stringops @gol
|
||||
-mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
|
||||
-m96bit-long-double -mregparm=@var{num} -msseregparm @gol
|
||||
@ -9501,6 +9501,10 @@ instruction set support.
|
||||
@item k8, opteron, athlon64, athlon-fx
|
||||
AMD K8 core based CPUs with x86-64 instruction set support. (This supersets
|
||||
MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.)
|
||||
@item amdfam10
|
||||
AMD Family 10 core based CPUs with x86-64 instruction set support. (This
|
||||
supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit
|
||||
instruction set extensions.)
|
||||
@item winchip-c6
|
||||
IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction
|
||||
set support.
|
||||
@ -9795,8 +9799,14 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
|
||||
@itemx -mno-sse3
|
||||
@item -mssse3
|
||||
@itemx -mno-ssse3
|
||||
@item -msse4a
|
||||
@itemx -mno-sse4a
|
||||
@item -m3dnow
|
||||
@itemx -mno-3dnow
|
||||
@item -mpopcnt
|
||||
@itemx -mno-popcnt
|
||||
@item -mabm
|
||||
@itemx -mno-abm
|
||||
@opindex mmmx
|
||||
@opindex mno-mmx
|
||||
@opindex msse
|
||||
@ -9804,7 +9814,7 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
|
||||
@opindex m3dnow
|
||||
@opindex mno-3dnow
|
||||
These switches enable or disable the use of instructions in the MMX,
|
||||
SSE, SSE2, SSE3, SSSE3 or 3DNow! extended instruction sets.
|
||||
SSE, SSE2, SSE3, SSSE3, SSE4A, ABM or 3DNow! extended instruction sets.
|
||||
These extensions are also available as built-in functions: see
|
||||
@ref{X86 Built-in Functions}, for details of the functions enabled and
|
||||
disabled by these switches.
|
||||
|
@ -1,3 +1,12 @@
|
||||
2007-02-05 Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
|
||||
|
||||
* gcc.dg/i386-cpuid.h: Test whether SSE4A is supported
|
||||
for running tests.
|
||||
* gcc.target/i386/sse4a-extract.c: New test.
|
||||
* gcc.target/i386/sse4a-insert.c: New test.
|
||||
* gcc.target/i386/sse4a-montsd.c: New test.
|
||||
* gcc.target/i386/sse4a-montss.c: New test.
|
||||
|
||||
2007-02-05 Richard Guenther <rguenther@suse.de>
|
||||
|
||||
* gcc.target/i386/vectorize3.c: New testcase.
|
||||
|
@ -12,6 +12,10 @@
|
||||
#define bit_SSE (1 << 25)
|
||||
#define bit_SSE2 (1 << 26)
|
||||
|
||||
/* Extended Features */
|
||||
/* %ecx */
|
||||
#define bit_SSE4a (1 << 6)
|
||||
|
||||
#ifndef NOINLINE
|
||||
#define NOINLINE __attribute__ ((noinline))
|
||||
#endif
|
||||
@ -60,8 +64,43 @@ i386_get_cpuid (unsigned int *ecx, unsigned int *edx)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Query the extended CPUID function 0x80000001.

   On success, store the resulting %ecx and %edx values in *ECX and *EDX
   and return 1.  Return 0 when i386_get_cpuid returns zero or when the
   highest supported extended function number is below 0x80000001.  */
static inline unsigned int
i386_get_extended_cpuid (unsigned int *ecx, unsigned int *edx)
{
  /* The value read from %eax is compared against an unsigned constant
     below, so keep it unsigned rather than relying on an implicit
     signed-to-unsigned conversion.  */
  unsigned int fl1;

  if (!(i386_get_cpuid (ecx, edx)))
    return 0;

  /* Invoke CPUID(0x80000000) to get the highest supported extended
     function number.  */
#ifdef __x86_64__
  __asm__ ("cpuid"
	   : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx", "ebx");
#else
  /* %ebx is saved and restored by hand here instead of being listed as a
     clobber.  */
  __asm__ ("pushl %%ebx; cpuid; popl %%ebx"
	   : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx");
#endif
  /* Check that the extended function used below is supported.  */
  if (fl1 < 0x80000001)
    return 0;

  /* Invoke CPUID(0x80000001), return %ecx and %edx; the caller can examine
     the bits to determine what is supported.  */
#ifdef __x86_64__
  __asm__ ("cpuid"
	   : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001) : "ebx");
#else
  __asm__ ("pushl %%ebx; cpuid; popl %%ebx"
	   : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001));
#endif
  return 1;
}
|
||||
|
||||
|
||||
unsigned int i386_cpuid_ecx (void) NOINLINE;
|
||||
unsigned int i386_cpuid_edx (void) NOINLINE;
|
||||
unsigned int i386_extended_cpuid_ecx (void) NOINLINE;
|
||||
unsigned int i386_extended_cpuid_edx (void) NOINLINE;
|
||||
|
||||
unsigned int NOINLINE
|
||||
i386_cpuid_ecx (void)
|
||||
@ -83,6 +122,26 @@ i386_cpuid_edx (void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned int NOINLINE
|
||||
i386_extended_cpuid_ecx (void)
|
||||
{
|
||||
unsigned int ecx, edx;
|
||||
if (i386_get_extended_cpuid (&ecx, &edx))
|
||||
return ecx;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned int NOINLINE
|
||||
i386_extended_cpuid_edx (void)
|
||||
{
|
||||
unsigned int ecx, edx;
|
||||
if (i386_get_extended_cpuid (&ecx, &edx))
|
||||
return edx;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int
|
||||
i386_cpuid (void)
|
||||
{
|
||||
|
100
gcc/testsuite/gcc.target/i386/sse4a-extract.c
Normal file
100
gcc/testsuite/gcc.target/i386/sse4a-extract.c
Normal file
@ -0,0 +1,100 @@
|
||||
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||||
/* { dg-options "-O2 -msse4a" } */
|
||||
#include <ammintrin.h>
|
||||
#include <stdlib.h>
|
||||
#include "../../gcc.dg/i386-cpuid.h"
|
||||
|
||||
static void sse4a_test (void);
|
||||
|
||||
/* Overlay of an __m128i vector with an array of two long long elements;
   the test helpers use it to read element 0 of a vector result.  */
typedef union
|
||||
{
|
||||
long long i[2];
|
||||
__m128i vec;
|
||||
} LI;
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
unsigned long cpu_facilities;
|
||||
|
||||
cpu_facilities = i386_extended_cpuid_ecx ();
|
||||
|
||||
/* Run SSE4a test only if host has SSE4a support. */
|
||||
if ((cpu_facilities & bit_SSE4a))
|
||||
sse4a_test ();
|
||||
|
||||
exit (0);
|
||||
}
|
||||
|
||||
static long long
|
||||
sse4a_test_extrq (long long in)
|
||||
{
|
||||
__m128i v1, v2;
|
||||
long long index_length, pad;
|
||||
LI v_out;
|
||||
index_length = 0x0000000000000810;
|
||||
pad = 0x0;
|
||||
v1 = _mm_set_epi64x (pad, in);
|
||||
v2 = _mm_set_epi64x (pad, index_length);
|
||||
v_out.vec = _mm_extract_si64 (v1, v2);
|
||||
return (v_out.i[0]);
|
||||
}
|
||||
|
||||
static long long
|
||||
sse4a_test_extrqi (long long in)
|
||||
{
|
||||
__m128i v1;
|
||||
long long pad =0x0;
|
||||
LI v_out;
|
||||
v1 = _mm_set_epi64x (pad, in);
|
||||
v_out.vec = _mm_extracti_si64 (v1, (unsigned int) 0x10,(unsigned int) 0x08);
|
||||
return (v_out.i[0]);
|
||||
}
|
||||
|
||||
/* Compare the observed value I1 with the expected value I2.
   Return 0 when they are equal and 1 when they differ, so that callers can
   accumulate a failure count.  The return type is spelled out because
   implicit int is not valid C since C99.  */
static int
chk (long long i1, long long i2)
{
  return i1 != i2;
}
|
||||
|
||||
/* Input quadwords; sse4a_test passes each one to both sse4a_test_extrq and
   sse4a_test_extrqi.  */
long long vals_in[5] =
|
||||
{
|
||||
0x1234567887654321,
|
||||
0x1456782093002490,
|
||||
0x2340909123990390,
|
||||
0x9595959599595999,
|
||||
0x9099038798000029
|
||||
};
|
||||
|
||||
/* Expected results: vals_out[i] is the 16-bit field at bits 8..23 of
   vals_in[i], zero-extended.  */
long long vals_out[5] =
|
||||
{
|
||||
0x0000000000006543,
|
||||
0x0000000000000024,
|
||||
0x0000000000009903,
|
||||
0x0000000000005959,
|
||||
0x0000000000000000
|
||||
};
|
||||
|
||||
static void
|
||||
sse4a_test (void)
|
||||
{
|
||||
int i;
|
||||
int fail = 0;
|
||||
long long out;
|
||||
|
||||
for (i = 0; i < 5; i += 1)
|
||||
{
|
||||
out = sse4a_test_extrq (vals_in[i]);
|
||||
fail += chk(out, vals_out[i]);
|
||||
|
||||
out = sse4a_test_extrqi (vals_in[i]);
|
||||
fail += chk(out, vals_out[i]);
|
||||
}
|
||||
|
||||
if (fail != 0)
|
||||
abort ();
|
||||
|
||||
exit (0);
|
||||
}
|
110
gcc/testsuite/gcc.target/i386/sse4a-insert.c
Normal file
110
gcc/testsuite/gcc.target/i386/sse4a-insert.c
Normal file
@ -0,0 +1,110 @@
|
||||
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||||
/* { dg-options "-O2 -msse4a" } */
|
||||
#include <ammintrin.h>
|
||||
#include <stdlib.h>
|
||||
#include "../../gcc.dg/i386-cpuid.h"
|
||||
|
||||
static void sse4a_test (void);
|
||||
|
||||
/* Overlay of an __m128i vector with an array of two long long elements;
   the test helpers use it to read element 0 of a vector result.  */
typedef union
|
||||
{
|
||||
long long i[2];
|
||||
__m128i vec;
|
||||
} LI;
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
unsigned long cpu_facilities;
|
||||
|
||||
cpu_facilities = i386_extended_cpuid_ecx ();
|
||||
|
||||
/* Run SSE4a test only if host has SSE4a support. */
|
||||
if ((cpu_facilities & bit_SSE4a))
|
||||
sse4a_test ();
|
||||
|
||||
exit (0);
|
||||
}
|
||||
|
||||
static long long
|
||||
sse4a_test_insert (long long in1, long long in2)
|
||||
{
|
||||
__m128i v1,v2;
|
||||
long long index_length, pad;
|
||||
LI v_out;
|
||||
index_length = 0x0000000000000810;
|
||||
pad = 0x0;
|
||||
v1 = _mm_set_epi64x (pad, in1);
|
||||
v2 = _mm_set_epi64x (index_length, in2);
|
||||
v_out.vec = _mm_insert_si64 (v1, v2);
|
||||
return (v_out.i[0]);
|
||||
}
|
||||
|
||||
static long long
|
||||
sse4a_test_inserti (long long in1, long long in2)
|
||||
{
|
||||
__m128i v1,v2;
|
||||
long long pad = 0x0;
|
||||
LI v_out;
|
||||
v1 = _mm_set_epi64x (pad, in1);
|
||||
v2 = _mm_set_epi64x (pad, in2);
|
||||
v_out.vec = _mm_inserti_si64 (v1, v2, (unsigned int) 0x10, (unsigned int) 0x08);
|
||||
return (v_out.i[0]);
|
||||
}
|
||||
|
||||
/* Compare the observed value I1 with the expected value I2.
   Return 0 when they are equal and 1 when they differ, so that callers can
   accumulate a failure count.  The return type is spelled out because
   implicit int is not valid C since C99.  */
static int
chk (long long i1, long long i2)
{
  return i1 != i2;
}
|
||||
|
||||
/* First inputs: sse4a_test passes vals_in1[i] as the IN1 argument of both
   insert helpers.  */
long long vals_in1[5] =
|
||||
{
|
||||
0x1234567887654321,
|
||||
0x1456782093002490,
|
||||
0x2340909123990390,
|
||||
0x9595959599595999,
|
||||
0x9099038798000029
|
||||
};
|
||||
|
||||
/* Second inputs: sse4a_test passes vals_in2[i] as the IN2 argument of both
   insert helpers.  */
long long vals_in2[5] =
|
||||
{
|
||||
0x9ABCDEF00FEDCBA9,
|
||||
0x234567097289672A,
|
||||
0x45476453097BD342,
|
||||
0x23569012AE586FF0,
|
||||
0x432567ABCDEF765D
|
||||
};
|
||||
|
||||
/* Expected results: vals_out[i] is vals_in1[i] with bits 8..23 replaced by
   the low 16 bits of vals_in2[i].  */
long long vals_out[5] =
|
||||
{
|
||||
0x1234567887CBA921,
|
||||
0x1456782093672A90,
|
||||
0x2340909123D34290,
|
||||
0x95959595996FF099,
|
||||
0x9099038798765D29
|
||||
};
|
||||
|
||||
static void
|
||||
sse4a_test (void)
|
||||
{
|
||||
int i;
|
||||
int fail = 0;
|
||||
long long out;
|
||||
|
||||
for (i = 0; i < 5; i += 1)
|
||||
{
|
||||
out = sse4a_test_insert (vals_in1[i], vals_in2[i]);
|
||||
fail += chk(out, vals_out[i]);
|
||||
|
||||
out = sse4a_test_inserti (vals_in1[i], vals_in2[i]);
|
||||
fail += chk(out, vals_out[i]);
|
||||
}
|
||||
|
||||
if (fail != 0)
|
||||
abort ();
|
||||
|
||||
exit (0);
|
||||
}
|
64
gcc/testsuite/gcc.target/i386/sse4a-montsd.c
Normal file
64
gcc/testsuite/gcc.target/i386/sse4a-montsd.c
Normal file
@ -0,0 +1,64 @@
|
||||
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||||
/* { dg-options "-O2 -msse4a" } */
|
||||
#include <ammintrin.h>
|
||||
#include <stdlib.h>
|
||||
#include "../../gcc.dg/i386-cpuid.h"
|
||||
|
||||
static void sse4a_test (void);
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
unsigned long cpu_facilities;
|
||||
|
||||
cpu_facilities = i386_extended_cpuid_ecx ();
|
||||
|
||||
/* Run SSE4a test only if host has SSE4a support. */
|
||||
if ((cpu_facilities & bit_SSE4a))
|
||||
sse4a_test ();
|
||||
|
||||
exit (0);
|
||||
}
|
||||
|
||||
/* Load the double at IN with _mm_load_sd and write it to OUT with
   _mm_stream_sd.  */
static void
sse4a_test_movntsd (double *out, double *in)
{
  _mm_stream_sd (out, _mm_load_sd (in));
}
|
||||
|
||||
/* Compare the doubles at V1 and V2.
   Return 0 when they compare equal and 1 otherwise, so that callers can
   accumulate a failure count.  Neither object is written, hence the const
   qualifiers; callers passing plain double pointers are unaffected.  */
static int
chk_sd (const double *v1, const double *v2)
{
  return v1[0] != v2[0];
}
|
||||
|
||||
/* Test inputs: sse4a_test stores each element through sse4a_test_movntsd
   and compares the stored value with the original.  */
double vals[10] =
|
||||
{
|
||||
100.0, 200.0, 300.0, 400.0, 5.0,
|
||||
-1.0, .345, -21.5, 9.32, 8.41
|
||||
};
|
||||
|
||||
static void
|
||||
sse4a_test (void)
|
||||
{
|
||||
int i;
|
||||
int fail = 0;
|
||||
double *out;
|
||||
|
||||
out = (double *) malloc (sizeof (double));
|
||||
for (i = 0; i < 10; i += 1)
|
||||
{
|
||||
sse4a_test_movntsd (out, &vals[i]);
|
||||
|
||||
fail += chk_sd (out, &vals[i]);
|
||||
}
|
||||
|
||||
if (fail != 0)
|
||||
abort ();
|
||||
|
||||
exit (0);
|
||||
}
|
64
gcc/testsuite/gcc.target/i386/sse4a-montss.c
Normal file
64
gcc/testsuite/gcc.target/i386/sse4a-montss.c
Normal file
@ -0,0 +1,64 @@
|
||||
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
|
||||
/* { dg-options "-O2 -msse4a" } */
|
||||
#include <ammintrin.h>
|
||||
#include <stdlib.h>
|
||||
#include "../../gcc.dg/i386-cpuid.h"
|
||||
|
||||
static void sse4a_test (void);
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
unsigned long cpu_facilities;
|
||||
|
||||
cpu_facilities = i386_extended_cpuid_ecx ();
|
||||
|
||||
/* Run SSE4a test only if host has SSE4a support. */
|
||||
if ((cpu_facilities & bit_SSE4a))
|
||||
sse4a_test ();
|
||||
|
||||
exit (0);
|
||||
}
|
||||
|
||||
/* Load the float at IN with _mm_load_ss and write it to OUT with
   _mm_stream_ss.  */
static void
sse4a_test_movntss (float *out, float *in)
{
  _mm_stream_ss (out, _mm_load_ss (in));
}
|
||||
|
||||
/* Compare the floats at V1 and V2.
   Return 0 when they compare equal and 1 otherwise, so that callers can
   accumulate a failure count.  Neither object is written, hence the const
   qualifiers; callers passing plain float pointers are unaffected.  */
static int
chk_ss (const float *v1, const float *v2)
{
  return v1[0] != v2[0];
}
|
||||
|
||||
/* Test inputs: sse4a_test stores each element through sse4a_test_movntss
   and compares the stored value with the original.  */
float vals[10] =
|
||||
{
|
||||
100.0, 200.0, 300.0, 400.0, 5.0,
|
||||
-1.0, .345, -21.5, 9.32, 8.41
|
||||
};
|
||||
|
||||
static void
|
||||
sse4a_test (void)
|
||||
{
|
||||
int i;
|
||||
int fail = 0;
|
||||
float *out;
|
||||
|
||||
out = (float *) malloc (sizeof (float));
|
||||
for (i = 0; i < 10; i += 1)
|
||||
{
|
||||
sse4a_test_movntss (out, &vals[i]);
|
||||
|
||||
fail += chk_ss (out, &vals[i]);
|
||||
}
|
||||
|
||||
if (fail != 0)
|
||||
abort ();
|
||||
|
||||
exit (0);
|
||||
}
|
Loading…
Reference in New Issue
Block a user