From 9a5cee0228a8b8d639e29682c800fe251175ce62 Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Tue, 22 May 2007 14:37:19 +0000
Subject: [PATCH] config.gcc (i[34567]86-*-*): Add smmintrin.h to
 extra_headers.

2007-05-22  H.J. Lu
	    Richard Henderson

	* config.gcc (i[34567]86-*-*): Add smmintrin.h to
	extra_headers.
	(x86_64-*-*): Likewise.

	* i386/i386-modes.def (V2QI): New.

	* config/i386/i386.c (ix86_handle_option): Handle SSE4.1 and
	SSE4A.
	(override_options): Support SSE4.1.
	(IX86_BUILTIN_BLENDPD): New for SSE4.1.
	(IX86_BUILTIN_BLENDPS): Likewise.
	(IX86_BUILTIN_BLENDVPD): Likewise.
	(IX86_BUILTIN_BLENDVPS): Likewise.
	(IX86_BUILTIN_PBLENDVB128): Likewise.
	(IX86_BUILTIN_PBLENDW128): Likewise.
	(IX86_BUILTIN_DPPD): Likewise.
	(IX86_BUILTIN_DPPS): Likewise.
	(IX86_BUILTIN_INSERTPS128): Likewise.
	(IX86_BUILTIN_MOVNTDQA): Likewise.
	(IX86_BUILTIN_MPSADBW128): Likewise.
	(IX86_BUILTIN_PACKUSDW128): Likewise.
	(IX86_BUILTIN_PCMPEQQ): Likewise.
	(IX86_BUILTIN_PHMINPOSUW128): Likewise.
	(IX86_BUILTIN_PMAXSB128): Likewise.
	(IX86_BUILTIN_PMAXSD128): Likewise.
	(IX86_BUILTIN_PMAXUD128): Likewise.
	(IX86_BUILTIN_PMAXUW128): Likewise.
	(IX86_BUILTIN_PMINSB128): Likewise.
	(IX86_BUILTIN_PMINSD128): Likewise.
	(IX86_BUILTIN_PMINUD128): Likewise.
	(IX86_BUILTIN_PMINUW128): Likewise.
	(IX86_BUILTIN_PMOVSXBW128): Likewise.
	(IX86_BUILTIN_PMOVSXBD128): Likewise.
	(IX86_BUILTIN_PMOVSXBQ128): Likewise.
	(IX86_BUILTIN_PMOVSXWD128): Likewise.
	(IX86_BUILTIN_PMOVSXWQ128): Likewise.
	(IX86_BUILTIN_PMOVSXDQ128): Likewise.
	(IX86_BUILTIN_PMOVZXBW128): Likewise.
	(IX86_BUILTIN_PMOVZXBD128): Likewise.
	(IX86_BUILTIN_PMOVZXBQ128): Likewise.
	(IX86_BUILTIN_PMOVZXWD128): Likewise.
	(IX86_BUILTIN_PMOVZXWQ128): Likewise.
	(IX86_BUILTIN_PMOVZXDQ128): Likewise.
	(IX86_BUILTIN_PMULDQ128): Likewise.
	(IX86_BUILTIN_PMULLD128): Likewise.
	(IX86_BUILTIN_ROUNDPD): Likewise.
	(IX86_BUILTIN_ROUNDPS): Likewise.
	(IX86_BUILTIN_ROUNDSD): Likewise.
	(IX86_BUILTIN_ROUNDSS): Likewise.
	(IX86_BUILTIN_PTESTZ): Likewise.
	(IX86_BUILTIN_PTESTC): Likewise.
	(IX86_BUILTIN_PTESTNZC): Likewise.
	(IX86_BUILTIN_VEC_EXT_V16QI): Likewise.
	(IX86_BUILTIN_VEC_SET_V2DI): Likewise.
	(IX86_BUILTIN_VEC_SET_V4SF): Likewise.
	(IX86_BUILTIN_VEC_SET_V4SI): Likewise.
	(IX86_BUILTIN_VEC_SET_V16QI): Likewise.
	(bdesc_ptest): New.
	(bdesc_sse_3arg): Likewise.
	(bdesc_2arg): Likewise.
	(bdesc_1arg): Likewise.
	(ix86_init_mmx_sse_builtins): Support SSE4.1.  Handle SSE builtins
	with 3 args.
	(ix86_expand_sse_4_operands_builtin): New.
	(ix86_expand_unop_builtin): Support 2 arg builtins with a constant
	smaller than 8 bits as the 2nd arg.
	(ix86_expand_sse_ptest): New.
	(ix86_expand_builtin): Support SSE4.1.  Support 3 arg SSE builtins.
	(ix86_expand_vector_set): Support SSE4.1.
	(ix86_expand_vector_extract): Likewise.

	* config/i386/i386.h (TARGET_CPU_CPP_BUILTINS): Define
	__SSE4_1__ for -msse4.1.

	* config/i386/i386.md (UNSPEC_BLENDV): New for SSE4.1.
	(UNSPEC_INSERTPS): Likewise.
	(UNSPEC_DP): Likewise.
	(UNSPEC_MOVNTDQA): Likewise.
	(UNSPEC_MPSADBW): Likewise.
	(UNSPEC_PHMINPOSUW): Likewise.
	(UNSPEC_PTEST): Likewise.
	(UNSPEC_ROUNDP): Likewise.
	(UNSPEC_ROUNDS): Likewise.

	* config/i386/i386.opt (msse4.1): New for SSE4.1.

	* config/i386/predicates.md (const_pow2_1_to_2_operand): New.
	(const_pow2_1_to_32768_operand): Likewise.

	* config/i386/smmintrin.h: New.  The SSE4.1 intrinsic header
	file.

	* config/i386/sse.md (*vec_setv4sf_sse4_1): New pattern for
	SSE4.1.
	(sse4_1_insertps): Likewise.
	(*sse4_1_extractps): Likewise.
	(sse4_1_ptest): Likewise.
	(sse4_1_mulv2siv2di3): Likewise.
	(*sse4_1_mulv4si3): Likewise.
	(*sse4_1_smax<mode>3): Likewise.
	(*sse4_1_umax<mode>3): Likewise.
	(*sse4_1_smin<mode>3): Likewise.
	(*sse4_1_umin<mode>3): Likewise.
	(sse4_1_eqv2di3): Likewise.
	(*sse4_1_pinsrb): Likewise.
	(*sse4_1_pinsrd): Likewise.
	(*sse4_1_pinsrq): Likewise.
	(*sse4_1_pextrb): Likewise.
	(*sse4_1_pextrb_memory): Likewise.
	(*sse4_1_pextrw_memory): Likewise.
	(*sse4_1_pextrq): Likewise.
	(sse4_1_blendpd): Likewise.
	(sse4_1_blendps): Likewise.
	(sse4_1_blendvpd): Likewise.
	(sse4_1_blendvps): Likewise.
	(sse4_1_dppd): Likewise.
	(sse4_1_dpps): Likewise.
	(sse4_1_movntdqa): Likewise.
	(sse4_1_mpsadbw): Likewise.
	(sse4_1_packusdw): Likewise.
	(sse4_1_pblendvb): Likewise.
	(sse4_1_pblendw): Likewise.
	(sse4_1_phminposuw): Likewise.
	(sse4_1_extendv8qiv8hi2): Likewise.
	(*sse4_1_extendv8qiv8hi2): Likewise.
	(sse4_1_extendv4qiv4si2): Likewise.
	(*sse4_1_extendv4qiv4si2): Likewise.
	(sse4_1_extendv2qiv2di2): Likewise.
	(*sse4_1_extendv2qiv2di2): Likewise.
	(sse4_1_extendv4hiv4si2): Likewise.
	(*sse4_1_extendv4hiv4si2): Likewise.
	(sse4_1_extendv2hiv2di2): Likewise.
	(*sse4_1_extendv2hiv2di2): Likewise.
	(sse4_1_extendv2siv2di2): Likewise.
	(*sse4_1_extendv2siv2di2): Likewise.
	(sse4_1_zero_extendv8qiv8hi2): Likewise.
	(*sse4_1_zero_extendv8qiv8hi2): Likewise.
	(sse4_1_zero_extendv4qiv4si2): Likewise.
	(*sse4_1_zero_extendv4qiv4si2): Likewise.
	(sse4_1_zero_extendv2qiv2di2): Likewise.
	(*sse4_1_zero_extendv2qiv2di2): Likewise.
	(sse4_1_zero_extendv4hiv4si2): Likewise.
	(*sse4_1_zero_extendv4hiv4si2): Likewise.
	(sse4_1_zero_extendv2hiv2di2): Likewise.
	(*sse4_1_zero_extendv2hiv2di2): Likewise.
	(sse4_1_zero_extendv2siv2di2): Likewise.
	(*sse4_1_zero_extendv2siv2di2): Likewise.
	(sse4_1_roundpd): Likewise.
	(sse4_1_roundps): Likewise.
	(sse4_1_roundsd): Likewise.
	(sse4_1_roundss): Likewise.
	(mulv4si3): Don't expand for SSE4.1.
	(smax<mode>3): Likewise.
	(umaxv4si3): Likewise.
	(uminv16qi3): Likewise.
	(umin<mode>3): Likewise.
	(umaxv8hi3): Rewrite.  Only enabled for SSE4.1.

	* doc/extend.texi: Document SSE4.1 built-in functions.

	* doc/invoke.texi: Document -msse4.1.

Co-Authored-By: Richard Henderson

From-SVN: r124945
---
 gcc/ChangeLog                  |  165 ++++++
 gcc/config.gcc                 |    4 +-
 gcc/config/i386/i386-modes.def |    1 +
 gcc/config/i386/i386.c         |  512 +++++++++++++++-
 gcc/config/i386/i386.h         |    2 +
 gcc/config/i386/i386.md        |   11 +
 gcc/config/i386/i386.opt       |    4 +
 gcc/config/i386/predicates.md  |   13 +
 gcc/config/i386/smmintrin.h    |  578 ++++++++++++++++++
 gcc/config/i386/sse.md         | 1004 +++++++++++++++++++++++++++++---
 gcc/doc/extend.texi            |   78 +++
 gcc/doc/invoke.texi            |    8 +-
 12 files changed, 2287 insertions(+), 93 deletions(-)
 create mode 100644 gcc/config/i386/smmintrin.h

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d390d085da5..160684d6423 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,168 @@
+2007-05-22  H.J. Lu
+	    Richard Henderson
+
+	* config.gcc (i[34567]86-*-*): Add smmintrin.h to
+	extra_headers.
+	(x86_64-*-*): Likewise.
+
+	* i386/i386-modes.def (V2QI): New.
+
+	* config/i386/i386.c (ix86_handle_option): Handle SSE4.1 and
+	SSE4A.
+	(override_options): Support SSE4.1.
+	(IX86_BUILTIN_BLENDPD): New for SSE4.1.
+	(IX86_BUILTIN_BLENDPS): Likewise.
+	(IX86_BUILTIN_BLENDVPD): Likewise.
+	(IX86_BUILTIN_BLENDVPS): Likewise.
+	(IX86_BUILTIN_PBLENDVB128): Likewise.
+	(IX86_BUILTIN_PBLENDW128): Likewise.
+	(IX86_BUILTIN_DPPD): Likewise.
+	(IX86_BUILTIN_DPPS): Likewise.
+	(IX86_BUILTIN_INSERTPS128): Likewise.
+	(IX86_BUILTIN_MOVNTDQA): Likewise.
+	(IX86_BUILTIN_MPSADBW128): Likewise.
+	(IX86_BUILTIN_PACKUSDW128): Likewise.
+	(IX86_BUILTIN_PCMPEQQ): Likewise.
+	(IX86_BUILTIN_PHMINPOSUW128): Likewise.
+	(IX86_BUILTIN_PMAXSB128): Likewise.
+	(IX86_BUILTIN_PMAXSD128): Likewise.
+	(IX86_BUILTIN_PMAXUD128): Likewise.
+	(IX86_BUILTIN_PMAXUW128): Likewise.
+	(IX86_BUILTIN_PMINSB128): Likewise.
+	(IX86_BUILTIN_PMINSD128): Likewise.
+	(IX86_BUILTIN_PMINUD128): Likewise.
+	(IX86_BUILTIN_PMINUW128): Likewise.
+	(IX86_BUILTIN_PMOVSXBW128): Likewise.
+	(IX86_BUILTIN_PMOVSXBD128): Likewise.
+	(IX86_BUILTIN_PMOVSXBQ128): Likewise.
+	(IX86_BUILTIN_PMOVSXWD128): Likewise.
+	(IX86_BUILTIN_PMOVSXWQ128): Likewise.
+	(IX86_BUILTIN_PMOVSXDQ128): Likewise.
+	(IX86_BUILTIN_PMOVZXBW128): Likewise.
+	(IX86_BUILTIN_PMOVZXBD128): Likewise.
+	(IX86_BUILTIN_PMOVZXBQ128): Likewise.
+	(IX86_BUILTIN_PMOVZXWD128): Likewise.
+	(IX86_BUILTIN_PMOVZXWQ128): Likewise.
+	(IX86_BUILTIN_PMOVZXDQ128): Likewise.
+	(IX86_BUILTIN_PMULDQ128): Likewise.
+	(IX86_BUILTIN_PMULLD128): Likewise.
+	(IX86_BUILTIN_ROUNDPD): Likewise.
+	(IX86_BUILTIN_ROUNDPS): Likewise.
+	(IX86_BUILTIN_ROUNDSD): Likewise.
+	(IX86_BUILTIN_ROUNDSS): Likewise.
+	(IX86_BUILTIN_PTESTZ): Likewise.
+	(IX86_BUILTIN_PTESTC): Likewise.
+	(IX86_BUILTIN_PTESTNZC): Likewise.
+	(IX86_BUILTIN_VEC_EXT_V16QI): Likewise.
+	(IX86_BUILTIN_VEC_SET_V2DI): Likewise.
+	(IX86_BUILTIN_VEC_SET_V4SF): Likewise.
+	(IX86_BUILTIN_VEC_SET_V4SI): Likewise.
+	(IX86_BUILTIN_VEC_SET_V16QI): Likewise.
+	(bdesc_ptest): New.
+	(bdesc_sse_3arg): Likewise.
+	(bdesc_2arg): Likewise.
+	(bdesc_1arg): Likewise.
+	(ix86_init_mmx_sse_builtins): Support SSE4.1.  Handle SSE builtins
+	with 3 args.
+	(ix86_expand_sse_4_operands_builtin): New.
+	(ix86_expand_unop_builtin): Support 2 arg builtins with a constant
+	smaller than 8 bits as the 2nd arg.
+	(ix86_expand_sse_ptest): New.
+	(ix86_expand_builtin): Support SSE4.1.  Support 3 arg SSE builtins.
+	(ix86_expand_vector_set): Support SSE4.1.
+	(ix86_expand_vector_extract): Likewise.
+
+	* config/i386/i386.h (TARGET_CPU_CPP_BUILTINS): Define
+	__SSE4_1__ for -msse4.1.
+
+	* config/i386/i386.md (UNSPEC_BLENDV): New for SSE4.1.
+	(UNSPEC_INSERTPS): Likewise.
+	(UNSPEC_DP): Likewise.
+	(UNSPEC_MOVNTDQA): Likewise.
+	(UNSPEC_MPSADBW): Likewise.
+	(UNSPEC_PHMINPOSUW): Likewise.
+	(UNSPEC_PTEST): Likewise.
+	(UNSPEC_ROUNDP): Likewise.
+	(UNSPEC_ROUNDS): Likewise.
+
+	* config/i386/i386.opt (msse4.1): New for SSE4.1.
+
+	* config/i386/predicates.md (const_pow2_1_to_2_operand): New.
+	(const_pow2_1_to_32768_operand): Likewise.
+
+	* config/i386/smmintrin.h: New.  The SSE4.1 intrinsic header
+	file.
+
+	* config/i386/sse.md (*vec_setv4sf_sse4_1): New pattern for
+	SSE4.1.
+	(sse4_1_insertps): Likewise.
+	(*sse4_1_extractps): Likewise.
+	(sse4_1_ptest): Likewise.
+	(sse4_1_mulv2siv2di3): Likewise.
+	(*sse4_1_mulv4si3): Likewise.
+	(*sse4_1_smax<mode>3): Likewise.
+	(*sse4_1_umax<mode>3): Likewise.
+	(*sse4_1_smin<mode>3): Likewise.
+	(*sse4_1_umin<mode>3): Likewise.
+	(sse4_1_eqv2di3): Likewise.
+	(*sse4_1_pinsrb): Likewise.
+	(*sse4_1_pinsrd): Likewise.
+	(*sse4_1_pinsrq): Likewise.
+	(*sse4_1_pextrb): Likewise.
+	(*sse4_1_pextrb_memory): Likewise.
+	(*sse4_1_pextrw_memory): Likewise.
+	(*sse4_1_pextrq): Likewise.
+	(sse4_1_blendpd): Likewise.
+	(sse4_1_blendps): Likewise.
+	(sse4_1_blendvpd): Likewise.
+	(sse4_1_blendvps): Likewise.
+	(sse4_1_dppd): Likewise.
+	(sse4_1_dpps): Likewise.
+	(sse4_1_movntdqa): Likewise.
+	(sse4_1_mpsadbw): Likewise.
+	(sse4_1_packusdw): Likewise.
+	(sse4_1_pblendvb): Likewise.
+	(sse4_1_pblendw): Likewise.
+	(sse4_1_phminposuw): Likewise.
+	(sse4_1_extendv8qiv8hi2): Likewise.
+	(*sse4_1_extendv8qiv8hi2): Likewise.
+	(sse4_1_extendv4qiv4si2): Likewise.
+	(*sse4_1_extendv4qiv4si2): Likewise.
+	(sse4_1_extendv2qiv2di2): Likewise.
+	(*sse4_1_extendv2qiv2di2): Likewise.
+	(sse4_1_extendv4hiv4si2): Likewise.
+	(*sse4_1_extendv4hiv4si2): Likewise.
+	(sse4_1_extendv2hiv2di2): Likewise.
+	(*sse4_1_extendv2hiv2di2): Likewise.
+	(sse4_1_extendv2siv2di2): Likewise.
+	(*sse4_1_extendv2siv2di2): Likewise.
+	(sse4_1_zero_extendv8qiv8hi2): Likewise.
+	(*sse4_1_zero_extendv8qiv8hi2): Likewise.
+	(sse4_1_zero_extendv4qiv4si2): Likewise.
+	(*sse4_1_zero_extendv4qiv4si2): Likewise.
+	(sse4_1_zero_extendv2qiv2di2): Likewise.
+	(*sse4_1_zero_extendv2qiv2di2): Likewise.
+	(sse4_1_zero_extendv4hiv4si2): Likewise.
+	(*sse4_1_zero_extendv4hiv4si2): Likewise.
+	(sse4_1_zero_extendv2hiv2di2): Likewise.
+	(*sse4_1_zero_extendv2hiv2di2): Likewise.
+	(sse4_1_zero_extendv2siv2di2): Likewise.
+	(*sse4_1_zero_extendv2siv2di2): Likewise.
+	(sse4_1_roundpd): Likewise.
+	(sse4_1_roundps): Likewise.
+	(sse4_1_roundsd): Likewise.
+	(sse4_1_roundss): Likewise.
+	(mulv4si3): Don't expand for SSE4.1.
+	(smax<mode>3): Likewise.
+	(umaxv4si3): Likewise.
+	(uminv16qi3): Likewise.
+	(umin<mode>3): Likewise.
+	(umaxv8hi3): Rewrite.  Only enabled for SSE4.1.
+
+	* doc/extend.texi: Document SSE4.1 built-in functions.
+
+	* doc/invoke.texi: Document -msse4.1.
+
 2007-05-22  Nathan Sidwell
 
 	* config/m68k/linux.h (ASM_SPEC): Add asm_pcrel_spec.
diff --git a/gcc/config.gcc b/gcc/config.gcc
index bf1d0b1f137..fb346a0252a 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -276,12 +276,12 @@ xscale-*-*)
 i[34567]86-*-*)
 	cpu_type=i386
 	extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
-		       pmmintrin.h tmmintrin.h ammintrin.h"
+		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
 	extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
-		       pmmintrin.h tmmintrin.h ammintrin.h"
+		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 40c12b13c3e..2efccda0aec 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -68,6 +68,7 @@ VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
 VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI */
 VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
 VECTOR_MODES (FLOAT, 16);     /*       V8HF V4SF V2DF */
+VECTOR_MODE (INT, QI, 2);     /*                 V2QI */
 VECTOR_MODE (INT, DI, 4);     /*                 V4DI */
 VECTOR_MODE (INT, SI, 8);     /*                 V8SI */
 VECTOR_MODE (INT, HI, 16);    /*                V16HI */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 13d0b9eba52..99491b29029 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1594,6 +1594,14 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
       return true;
 
     case OPT_mssse3:
+      if (!value)
+	{
+	  target_flags &= ~(MASK_SSE4_1 | MASK_SSE4A);
+	  target_flags_explicit |= MASK_SSE4_1 | MASK_SSE4A;
+	}
+      return true;
+
+    case OPT_msse4_1:
       if (!value)
 	{
 	  target_flags &= ~MASK_SSE4A;
@@ -1601,6 +1609,14 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
 	}
       return true;
 
+    case OPT_msse4a:
+      if (!value)
+	{
+	  target_flags &= ~MASK_SSE4_1;
+	  target_flags_explicit |= MASK_SSE4_1;
+	}
+      return true;
+
     default:
       return true;
     }
@@ -1674,7 +1690,8 @@ override_options (void)
       PTA_POPCNT = 1 << 10,
       PTA_ABM = 1 << 11,
       PTA_SSE4A = 1 << 12,
-      PTA_NO_SAHF = 1 << 13
+      PTA_NO_SAHF = 1 << 13,
+      PTA_SSE4_1 = 1 << 14
     } flags;
   }
   const processor_alias_table[] =
@@ -1936,6 +1953,9 @@ override_options (void)
 	if (processor_alias_table[i].flags & PTA_SSSE3
 	    && !(target_flags_explicit & MASK_SSSE3))
 	  target_flags |= MASK_SSSE3;
+	if (processor_alias_table[i].flags & PTA_SSE4_1
+	    && !(target_flags_explicit & MASK_SSE4_1))
+	  target_flags |= MASK_SSE4_1;
 	if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
 	  x86_prefetch_sse = true;
 	if (processor_alias_table[i].flags & PTA_CX16)
@@ -2141,6 +2161,10 @@ override_options (void)
   if (!TARGET_80387)
     target_flags |= MASK_NO_FANCY_MATH_387;
 
+  /* Turn on SSSE3 builtins for -msse4.1.  */
+  if (TARGET_SSE4_1)
+    target_flags |= MASK_SSSE3;
+
   /* Turn on SSE3 builtins for -mssse3.  */
   if (TARGET_SSSE3)
     target_flags |= MASK_SSE3;
@@ -16412,6 +16436,61 @@ enum ix86_builtins
   IX86_BUILTIN_INSERTQI,
   IX86_BUILTIN_INSERTQ,
 
+  /* SSE4.1.  */
+  IX86_BUILTIN_BLENDPD,
+  IX86_BUILTIN_BLENDPS,
+  IX86_BUILTIN_BLENDVPD,
+  IX86_BUILTIN_BLENDVPS,
+  IX86_BUILTIN_PBLENDVB128,
+  IX86_BUILTIN_PBLENDW128,
+
+  IX86_BUILTIN_DPPD,
+  IX86_BUILTIN_DPPS,
+
+  IX86_BUILTIN_INSERTPS128,
+
+  IX86_BUILTIN_MOVNTDQA,
+  IX86_BUILTIN_MPSADBW128,
+  IX86_BUILTIN_PACKUSDW128,
+  IX86_BUILTIN_PCMPEQQ,
+  IX86_BUILTIN_PHMINPOSUW128,
+
+  IX86_BUILTIN_PMAXSB128,
+  IX86_BUILTIN_PMAXSD128,
+  IX86_BUILTIN_PMAXUD128,
+  IX86_BUILTIN_PMAXUW128,
+
+  IX86_BUILTIN_PMINSB128,
+  IX86_BUILTIN_PMINSD128,
+  IX86_BUILTIN_PMINUD128,
+  IX86_BUILTIN_PMINUW128,
+
+  IX86_BUILTIN_PMOVSXBW128,
+  IX86_BUILTIN_PMOVSXBD128,
+  IX86_BUILTIN_PMOVSXBQ128,
+  IX86_BUILTIN_PMOVSXWD128,
+  IX86_BUILTIN_PMOVSXWQ128,
+  IX86_BUILTIN_PMOVSXDQ128,
+
+  IX86_BUILTIN_PMOVZXBW128,
+  IX86_BUILTIN_PMOVZXBD128,
+  IX86_BUILTIN_PMOVZXBQ128,
+  IX86_BUILTIN_PMOVZXWD128,
+  IX86_BUILTIN_PMOVZXWQ128,
+  IX86_BUILTIN_PMOVZXDQ128,
+
+  IX86_BUILTIN_PMULDQ128,
+  IX86_BUILTIN_PMULLD128,
+
+  IX86_BUILTIN_ROUNDPD,
+  IX86_BUILTIN_ROUNDPS,
+  IX86_BUILTIN_ROUNDSD,
+  IX86_BUILTIN_ROUNDSS,
+
+  IX86_BUILTIN_PTESTZ,
+  IX86_BUILTIN_PTESTC,
+  IX86_BUILTIN_PTESTNZC,
+
   IX86_BUILTIN_VEC_INIT_V2SI,
   IX86_BUILTIN_VEC_INIT_V4HI,
   IX86_BUILTIN_VEC_INIT_V8QI,
@@ -16422,8 +16501,13 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_EXT_V8HI,
   IX86_BUILTIN_VEC_EXT_V2SI,
   IX86_BUILTIN_VEC_EXT_V4HI,
+  IX86_BUILTIN_VEC_EXT_V16QI,
+  IX86_BUILTIN_VEC_SET_V2DI,
+  IX86_BUILTIN_VEC_SET_V4SF,
+  IX86_BUILTIN_VEC_SET_V4SI,
   IX86_BUILTIN_VEC_SET_V8HI,
   IX86_BUILTIN_VEC_SET_V4HI,
+  IX86_BUILTIN_VEC_SET_V16QI,
 
   IX86_BUILTIN_MAX
 };
@@ -16508,6 +16592,33 @@ static const struct builtin_description bdesc_comi[] =
   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
 };
 
+static const struct builtin_description bdesc_ptest[] =
+{
+  /* SSE4.1 */
+  { MASK_SSE4_1, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, 0 },
+};
+
+/* SSE builtins with 3 arguments whose last argument must be an 8-bit
+   constant or xmm0.  */
+static const struct builtin_description bdesc_sse_3arg[] =
+{
+  /* SSE4.1 */
+  { MASK_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_roundsd, 0, IX86_BUILTIN_ROUNDSD, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_roundss, 0, IX86_BUILTIN_ROUNDSS, 0, 0 },
+};
+
 static const struct builtin_description bdesc_2arg[] =
 {
   /* SSE */
@@ -16806,7 +16917,21 @@ static const struct builtin_description bdesc_2arg[] =
   { MASK_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, 0, 0 },
   { MASK_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, 0, 0 },
   { MASK_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, 0, 0 },
-  { MASK_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, 0, 0 }
+  { MASK_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, 0, 0 },
+
+  /* SSE4.1 */
+  { MASK_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, 0, IX86_BUILTIN_PMULDQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, 0, 0 },
 };
 
 static const struct builtin_description bdesc_1arg[] =
@@ -16861,6 +16986,26 @@ static const struct builtin_description bdesc_1arg[] =
   { MASK_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, 0, 0 },
   { MASK_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, 0, 0 },
   { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 },
+
+  /* SSE4.1 */
+  { MASK_SSE4_1, CODE_FOR_sse4_1_extendv8qiv8hi2, 0, IX86_BUILTIN_PMOVSXBW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_extendv4qiv4si2, 0, IX86_BUILTIN_PMOVSXBD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_extendv2qiv2di2, 0, IX86_BUILTIN_PMOVSXBQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_extendv4hiv4si2, 0, IX86_BUILTIN_PMOVSXWD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_extendv2hiv2di2, 0, IX86_BUILTIN_PMOVSXWQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_extendv2siv2di2, 0, IX86_BUILTIN_PMOVSXDQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, 0, IX86_BUILTIN_PMOVZXBW128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, 0, IX86_BUILTIN_PMOVZXBD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, 0, IX86_BUILTIN_PMOVZXBQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, 0, IX86_BUILTIN_PMOVZXWD128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, 0, IX86_BUILTIN_PMOVZXWQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, 0, IX86_BUILTIN_PMOVZXDQ128, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, 0, 0 },
+
+  /* Fake 1 arg builtins with a constant smaller than 8 bits as the
+     2nd arg.  */
+  { MASK_SSE4_1, CODE_FOR_sse4_1_roundpd, 0, IX86_BUILTIN_ROUNDPD, 0, 0 },
+  { MASK_SSE4_1, CODE_FOR_sse4_1_roundps, 0, IX86_BUILTIN_ROUNDPS, 0, 0 },
 };
 
 /* Set up all the MMX/SSE builtins.  This is not called if TARGET_MMX
@@ -17167,6 +17312,55 @@ ix86_init_mmx_sse_builtins (void)
   tree v2di_ftype_v2di_v16qi
     = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
				 NULL_TREE);
+  tree v2df_ftype_v2df_v2df_v2df
+    = build_function_type_list (V2DF_type_node,
+				V2DF_type_node, V2DF_type_node,
+				V2DF_type_node, NULL_TREE);
+  tree v4sf_ftype_v4sf_v4sf_v4sf
+    = build_function_type_list (V4SF_type_node,
+				V4SF_type_node, V4SF_type_node,
+				V4SF_type_node, NULL_TREE);
+  tree v8hi_ftype_v16qi
+    = build_function_type_list (V8HI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v4si_ftype_v16qi
+    = build_function_type_list (V4SI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_v16qi
+    = build_function_type_list (V2DI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v4si_ftype_v8hi
+    = build_function_type_list (V4SI_type_node, V8HI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_v8hi
+    = build_function_type_list (V2DI_type_node, V8HI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_v4si
+    = build_function_type_list (V2DI_type_node, V4SI_type_node,
+				NULL_TREE);
+  tree v2di_ftype_pv2di
+    = build_function_type_list (V2DI_type_node, pv2di_type_node,
+				NULL_TREE);
+  tree v16qi_ftype_v16qi_v16qi_int
+    = build_function_type_list (V16QI_type_node, V16QI_type_node,
+				V16QI_type_node, integer_type_node,
+				NULL_TREE);
+  tree v16qi_ftype_v16qi_v16qi_v16qi
+    = build_function_type_list (V16QI_type_node, V16QI_type_node,
+				V16QI_type_node, V16QI_type_node,
+				NULL_TREE);
+  tree v8hi_ftype_v8hi_v8hi_int
+    = build_function_type_list (V8HI_type_node, V8HI_type_node,
+				V8HI_type_node, integer_type_node,
+				NULL_TREE);
+  tree v4si_ftype_v4si_v4si_int
+    = build_function_type_list (V4SI_type_node, V4SI_type_node,
+				V4SI_type_node, integer_type_node,
+				NULL_TREE);
+  tree int_ftype_v2di_v2di
+    = build_function_type_list (integer_type_node,
+				V2DI_type_node, V2DI_type_node,
+				NULL_TREE);
   tree float80_type;
   tree float128_type;
@@ -17193,6 +17387,64 @@ ix86_init_mmx_sse_builtins (void)
       (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
     }
 
+  /* Add all SSE builtins that are more or less simple operations on
+     three operands.  */
+  for (i = 0, d = bdesc_sse_3arg;
+       i < ARRAY_SIZE (bdesc_sse_3arg);
+       i++, d++)
+    {
+      /* Use one of the operands; the target can have a different mode for
+	 mask-generating compares.  */
+      enum machine_mode mode;
+      tree type;
+
+      if (d->name == 0)
+	continue;
+      mode = insn_data[d->icode].operand[1].mode;
+
+      switch (mode)
+	{
+	case V16QImode:
+	  type = v16qi_ftype_v16qi_v16qi_int;
+	  break;
+	case V8HImode:
+	  type = v8hi_ftype_v8hi_v8hi_int;
+	  break;
+	case V4SImode:
+	  type = v4si_ftype_v4si_v4si_int;
+	  break;
+	case V2DImode:
+	  type = v2di_ftype_v2di_v2di_int;
+	  break;
+	case V2DFmode:
+	  type = v2df_ftype_v2df_v2df_int;
+	  break;
+	case V4SFmode:
+	  type = v4sf_ftype_v4sf_v4sf_int;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      /* Override for variable blends.  */
+      switch (d->icode)
+	{
+	case CODE_FOR_sse4_1_blendvpd:
+	  type = v2df_ftype_v2df_v2df_v2df;
+	  break;
+	case CODE_FOR_sse4_1_blendvps:
+	  type = v4sf_ftype_v4sf_v4sf_v4sf;
+	  break;
+	case CODE_FOR_sse4_1_pblendvb:
+	  type = v16qi_ftype_v16qi_v16qi_v16qi;
+	  break;
+	default:
+	  break;
+	}
+
+      def_builtin (d->mask, d->name, type, d->code);
+    }
+
   /* Add all builtins that are more or less simple operations on two
      operands.  */
   for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
@@ -17322,6 +17574,10 @@ ix86_init_mmx_sse_builtins (void)
   else
     def_builtin (d->mask, d->name, int_ftype_v4sf_v4sf, d->code);
 
+  /* ptest insns.  */
+  for (i = 0, d = bdesc_ptest; i < ARRAY_SIZE (bdesc_ptest); i++, d++)
+    def_builtin (d->mask, d->name, int_ftype_v2di_v2di, d->code);
+
   def_builtin (MASK_MMX, "__builtin_ia32_packsswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKSSWB);
   def_builtin (MASK_MMX, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
   def_builtin (MASK_MMX, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);
@@ -17495,6 +17751,44 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int,
	       IX86_BUILTIN_PALIGNR);
 
+  /* SSE4.1.  */
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_movntdqa",
+	       v2di_ftype_pv2di, IX86_BUILTIN_MOVNTDQA);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovsxbw128",
+	       v8hi_ftype_v16qi, IX86_BUILTIN_PMOVSXBW128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovsxbd128",
+	       v4si_ftype_v16qi, IX86_BUILTIN_PMOVSXBD128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovsxbq128",
+	       v2di_ftype_v16qi, IX86_BUILTIN_PMOVSXBQ128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovsxwd128",
+	       v4si_ftype_v8hi, IX86_BUILTIN_PMOVSXWD128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovsxwq128",
+	       v2di_ftype_v8hi, IX86_BUILTIN_PMOVSXWQ128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovsxdq128",
+	       v2di_ftype_v4si, IX86_BUILTIN_PMOVSXDQ128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovzxbw128",
+	       v8hi_ftype_v16qi, IX86_BUILTIN_PMOVZXBW128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovzxbd128",
+	       v4si_ftype_v16qi, IX86_BUILTIN_PMOVZXBD128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovzxbq128",
+	       v2di_ftype_v16qi, IX86_BUILTIN_PMOVZXBQ128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovzxwd128",
+	       v4si_ftype_v8hi, IX86_BUILTIN_PMOVZXWD128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovzxwq128",
+	       v2di_ftype_v8hi, IX86_BUILTIN_PMOVZXWQ128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmovzxdq128",
+	       v2di_ftype_v4si, IX86_BUILTIN_PMOVZXDQ128);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_pmuldq128",
+	       v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULDQ128);
+  def_builtin_const (MASK_SSE4_1, "__builtin_ia32_roundpd",
+		     v2df_ftype_v2df_int, IX86_BUILTIN_ROUNDPD);
+  def_builtin_const (MASK_SSE4_1, "__builtin_ia32_roundps",
+		     v4sf_ftype_v4sf_int, IX86_BUILTIN_ROUNDPS);
+  def_builtin_const (MASK_SSE4_1, "__builtin_ia32_roundsd",
+		     v2df_ftype_v2df_v2df_int, IX86_BUILTIN_ROUNDSD);
+  def_builtin_const (MASK_SSE4_1, "__builtin_ia32_roundss",
+		     v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_ROUNDSS);
+
   /* AMDFAM10 SSE4A New built-ins  */
   def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd",
	       void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD);
@@ -17567,7 +17861,30 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (MASK_MMX, "__builtin_ia32_vec_ext_v2si",
	       ftype, IX86_BUILTIN_VEC_EXT_V2SI);
 
+  ftype = build_function_type_list (intQI_type_node, V16QI_type_node,
+				    integer_type_node, NULL_TREE);
+  def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v16qi",
+	       ftype, IX86_BUILTIN_VEC_EXT_V16QI);
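(An illustrative aside, not part of the patch: the def_builtin registrations above are what make names such as __builtin_ia32_pmovsxbw128 visible to C code.  A minimal sketch, assuming a compiler carrying this patch and -msse4.1; the __v16qi/__v8hi typedefs come from emmintrin.h, and user code would normally reach these builtins only through the smmintrin.h wrappers added later in this patch.)

#include <emmintrin.h>

__v8hi
sign_extend_low_bytes (__v16qi x)
{
  /* Expands through bdesc_1arg to the sse4_1_extendv8qiv8hi2
     pattern, i.e. a single pmovsxbw instruction.  */
  return __builtin_ia32_pmovsxbw128 (x);
}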
   /* Access to the vec_set patterns.  */
+  ftype = build_function_type_list (V2DI_type_node, V2DI_type_node,
+				    intDI_type_node,
+				    integer_type_node, NULL_TREE);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_vec_set_v2di",
+	       ftype, IX86_BUILTIN_VEC_SET_V2DI);
+
+  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node,
+				    float_type_node,
+				    integer_type_node, NULL_TREE);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_vec_set_v4sf",
+	       ftype, IX86_BUILTIN_VEC_SET_V4SF);
+
+  ftype = build_function_type_list (V4SI_type_node, V4SI_type_node,
+				    intSI_type_node,
+				    integer_type_node, NULL_TREE);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_vec_set_v4si",
+	       ftype, IX86_BUILTIN_VEC_SET_V4SI);
+
   ftype = build_function_type_list (V8HI_type_node, V8HI_type_node,
				     intHI_type_node,
				     integer_type_node, NULL_TREE);
@@ -17579,6 +17896,12 @@ ix86_init_mmx_sse_builtins (void)
				     integer_type_node, NULL_TREE);
   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_set_v4hi",
	       ftype, IX86_BUILTIN_VEC_SET_V4HI);
+
+  ftype = build_function_type_list (V16QI_type_node, V16QI_type_node,
+				    intQI_type_node,
+				    integer_type_node, NULL_TREE);
+  def_builtin (MASK_SSE4_1, "__builtin_ia32_vec_set_v16qi",
+	       ftype, IX86_BUILTIN_VEC_SET_V16QI);
 }
 
 static void
@@ -17599,6 +17922,74 @@ safe_vector_operand (rtx x, enum machine_mode mode)
   return x;
 }
 
+/* Subroutine of ix86_expand_builtin to take care of SSE insns with
+   4 operands.  The third argument must be a constant smaller than 8
+   bits or xmm0.  */
+
+static rtx
+ix86_expand_sse_4_operands_builtin (enum insn_code icode, tree exp,
+				    rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  tree arg2 = CALL_EXPR_ARG (exp, 2);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2 = expand_normal (arg2);
+  enum machine_mode tmode = insn_data[icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[icode].operand[1].mode;
+  enum machine_mode mode1 = insn_data[icode].operand[2].mode;
+  enum machine_mode mode2;
+  rtx xmm0;
+
+  if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !(*insn_data[icode].operand[2].predicate) (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  switch (icode)
+    {
+    case CODE_FOR_sse4_1_blendvpd:
+    case CODE_FOR_sse4_1_blendvps:
+    case CODE_FOR_sse4_1_pblendvb:
+      /* The third argument of variable blends must be xmm0.  */
+      xmm0 = gen_rtx_REG (tmode, FIRST_SSE_REG);
+      emit_move_insn (xmm0, op2);
+      op2 = xmm0;
+      break;
+    default:
+      mode2 = insn_data[icode].operand[2].mode;
+      if (! (*insn_data[icode].operand[3].predicate) (op2, mode2))
+	{
+	  switch (icode)
+	    {
+	    case CODE_FOR_sse4_1_roundsd:
+	    case CODE_FOR_sse4_1_roundss:
+	      error ("the third argument must be a 4-bit immediate");
+	      break;
+	    default:
+	      error ("the third argument must be an 8-bit immediate");
+	      break;
+	    }
+	  return const0_rtx;
+	}
+      break;
+    }
+
+  if (optimize
+      || target == 0
+      || GET_MODE (target) != tmode
+      || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+    target = gen_reg_rtx (tmode);
+  pat = GEN_FCN (icode) (target, op0, op1, op2);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
 
 static rtx
@@ -17720,7 +18111,28 @@ ix86_expand_unop_builtin (enum insn_code icode, tree exp,
       op0 = copy_to_mode_reg (mode0, op0);
     }
 
-  pat = GEN_FCN (icode) (target, op0);
+  switch (icode)
+    {
+    case CODE_FOR_sse4_1_roundpd:
+    case CODE_FOR_sse4_1_roundps:
+      {
+	tree arg1 = CALL_EXPR_ARG (exp, 1);
+	rtx op1 = expand_normal (arg1);
+	enum machine_mode mode1 = insn_data[icode].operand[2].mode;
+
+	if (! (*insn_data[icode].operand[2].predicate) (op1, mode1))
+	  {
+	    error ("the second argument must be a 4-bit immediate");
+	    return const0_rtx;
+	  }
+	pat = GEN_FCN (icode) (target, op0, op1);
+      }
+      break;
+    default:
+      pat = GEN_FCN (icode) (target, op0);
+      break;
+    }
+
   if (! pat)
     return 0;
   emit_insn (pat);
@@ -17867,6 +18279,50 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
   return SUBREG_REG (target);
 }
 
+/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
+
+static rtx
+ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
+		       rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
+  enum rtx_code comparison = d->comparison;
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  if (VECTOR_MODE_P (mode1))
+    op1 = safe_vector_operand (op1, mode1);
+
+  target = gen_reg_rtx (SImode);
+  emit_move_insn (target, const0_rtx);
+  target = gen_rtx_SUBREG (QImode, target, 0);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !(*insn_data[d->icode].operand[0].predicate) (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !(*insn_data[d->icode].operand[1].predicate) (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  pat = GEN_FCN (d->icode) (op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  emit_insn (gen_rtx_SET (VOIDmode,
+			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+			  gen_rtx_fmt_ee (comparison, QImode,
+					  SET_DEST (pat),
+					  const0_rtx)));
+
+  return SUBREG_REG (target);
+}
+
 /* Return the integer constant in ARG.  Constrain it to be in the range
    of the subparts of VEC_TYPE; issue an error if not.  */
@@ -18522,6 +18978,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
       emit_insn (pat);
       return target;
 
+    case IX86_BUILTIN_MOVNTDQA:
+      return ix86_expand_unop_builtin (CODE_FOR_sse4_1_movntdqa, exp,
+				       target, 1);
+
     case IX86_BUILTIN_MOVNTSD:
       return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, exp);
 
@@ -18642,16 +19102,28 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
     case IX86_BUILTIN_VEC_EXT_V8HI:
     case IX86_BUILTIN_VEC_EXT_V2SI:
     case IX86_BUILTIN_VEC_EXT_V4HI:
+    case IX86_BUILTIN_VEC_EXT_V16QI:
       return ix86_expand_vec_ext_builtin (exp, target);
 
+    case IX86_BUILTIN_VEC_SET_V2DI:
+    case IX86_BUILTIN_VEC_SET_V4SF:
+    case IX86_BUILTIN_VEC_SET_V4SI:
     case IX86_BUILTIN_VEC_SET_V8HI:
     case IX86_BUILTIN_VEC_SET_V4HI:
+    case IX86_BUILTIN_VEC_SET_V16QI:
       return ix86_expand_vec_set_builtin (exp);
 
     default:
       break;
     }
 
+  for (i = 0, d = bdesc_sse_3arg;
+       i < ARRAY_SIZE (bdesc_sse_3arg);
+       i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_sse_4_operands_builtin (d->icode, exp,
+						 target);
+
   for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
     if (d->code == fcode)
       {
@@ -18673,6 +19145,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
     if (d->code == fcode)
       return ix86_expand_sse_comi (d, exp, target);
 
+  for (i = 0, d = bdesc_ptest; i < ARRAY_SIZE (bdesc_ptest); i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_sse_ptest (d, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -20877,8 +21353,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
	}
       break;
 
-    case V2DFmode:
     case V2DImode:
+      use_vec_merge = TARGET_SSE4_1;
+      if (use_vec_merge)
+	break;
+
+    case V2DFmode:
       {
	rtx op0, op1;
 
@@ -20899,6 +21379,10 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
       return;
 
     case V4SFmode:
+      use_vec_merge = TARGET_SSE4_1;
+      if (use_vec_merge)
+	break;
+
       switch (elt)
	{
	case 0:
@@ -20946,6 +21430,10 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
       break;
 
     case V4SImode:
+      use_vec_merge = TARGET_SSE4_1;
+      if (use_vec_merge)
+	break;
+
       /* Element 0 handled by vec_merge below.  */
       if (elt == 0)
	{
@@ -20990,6 +21478,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
       break;
 
     case V16QImode:
+      use_vec_merge = TARGET_SSE4_1;
+      break;
+
     case V8QImode:
     default:
       break;
@@ -21036,6 +21527,10 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
       break;
 
     case V4SFmode:
+      use_vec_extr = TARGET_SSE4_1;
+      if (use_vec_extr)
+	break;
+
       switch (elt)
	{
	case 0:
@@ -21064,6 +21559,10 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
       break;
 
    case V4SImode:
+      use_vec_extr = TARGET_SSE4_1;
+      if (use_vec_extr)
+	break;
+
      if (TARGET_SSE2)
	{
	  switch (elt)
@@ -21109,6 +21608,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
       break;
 
     case V16QImode:
+      use_vec_extr = TARGET_SSE4_1;
+      break;
+
     case V8QImode:
       /* ??? Could extract the appropriate HImode element and shift.  */
     default:
       break;
@@ -21121,7 +21623,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
 
       /* Let the rtl optimizers know about the zero extension performed.  */
-      if (inner_mode == HImode)
+      if (inner_mode == QImode || inner_mode == HImode)
	{
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  target = gen_lowpart (SImode, target);
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 21ed8ad0ff2..e4cf24981ec 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -540,6 +540,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
	  builtin_define ("__SSE3__");				\
	if (TARGET_SSSE3)					\
	  builtin_define ("__SSSE3__");				\
+	if (TARGET_SSE4_1)					\
+	  builtin_define ("__SSE4_1__");			\
	if (TARGET_SSE4A)					\
	  builtin_define ("__SSE4A__");				\
	if (TARGET_SSE_MATH && TARGET_SSE)			\
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5beb901c60d..e4b2c86f13b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -162,6 +162,17 @@
    (UNSPEC_EXTRQ		131)
    (UNSPEC_INSERTQI		132)
    (UNSPEC_INSERTQ		133)
+
+   ; For SSE4.1 support
+   (UNSPEC_BLENDV		134)
+   (UNSPEC_INSERTPS		135)
+   (UNSPEC_DP			136)
+   (UNSPEC_MOVNTDQA		137)
+   (UNSPEC_MPSADBW		138)
+   (UNSPEC_PHMINPOSUW		139)
+   (UNSPEC_PTEST		140)
+   (UNSPEC_ROUNDP		141)
+   (UNSPEC_ROUNDS		142)
   ])
 
 (define_constants
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 7362601a562..ac60526bbf6 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -187,6 +187,10 @@ mssse3
 Target Report Mask(SSSE3)
 Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation
 
+msse4.1
+Target Report Mask(SSE4_1)
+Support MMX, SSE, SSE2, SSE3, SSSE3 and SSE4.1 built-in functions and code generation
+
 msse4a
 Target Report Mask(SSE4A)
 Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index efa5c98ee79..5dcc24b68d2 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -623,6 +623,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 4, 7)")))
 
+;; Match exactly one bit in 2-bit mask.
+(define_predicate "const_pow2_1_to_2_operand"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) == 1 || INTVAL (op) == 2")))
+
 ;; Match exactly one bit in 4-bit mask.
 (define_predicate "const_pow2_1_to_8_operand"
   (match_code "const_int")
@@ -639,6 +644,14 @@
   return log <= 7;
 })
 
+;; Match exactly one bit in 16-bit mask.
+(define_predicate "const_pow2_1_to_32768_operand"
+  (match_code "const_int")
+{
+  unsigned int log = exact_log2 (INTVAL (op));
+  return log <= 15;
+})
+
 ;; True if this is a constant appropriate for an increment or decrement.
 (define_predicate "incdec_operand"
   (match_code "const_int")
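(Editorial sketch of how the pieces above fit together, not part of the patch: -msse4.1 sets MASK_SSE4_1, TARGET_CPU_CPP_BUILTINS then defines __SSE4_1__, and that macro in turn unlocks the new header below.  Compiled with, e.g., gcc -O2 -msse4.1; the function name and fallback are illustrative only.)

#ifdef __SSE4_1__
# include <smmintrin.h>
static __m128i
max_u16 (__m128i x, __m128i y)
{
  return _mm_max_epu16 (x, y);	/* a single pmaxuw instruction */
}
#else
/* ... pre-SSE4.1 fallback would go here ... */
#endif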
diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h
new file mode 100644
index 00000000000..d57e2e6640d
--- /dev/null
+++ b/gcc/config/i386/smmintrin.h
@@ -0,0 +1,578 @@
+/* Copyright (C) 2007 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to
+   the Free Software Foundation, 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+/* As a special exception, if you include this header file into source
+   files compiled by GCC, this header file does not by itself cause
+   the resulting executable to be covered by the GNU General Public
+   License.  This exception does not however invalidate any other
+   reasons why the executable file might be covered by the GNU General
+   Public License.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 10.0.  */
+
+#ifndef _SMMINTRIN_H_INCLUDED
+#define _SMMINTRIN_H_INCLUDED
+
+#ifndef __SSE4_1__
+# error "SSE4.1 instruction set not enabled"
+#else
+
+/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
+   files.  */
+#include <tmmintrin.h>
+
+/* SSE4.1 */
+
+/* Rounding mode macros.  */
+#define _MM_FROUND_TO_NEAREST_INT	0x00
+#define _MM_FROUND_TO_NEG_INF		0x01
+#define _MM_FROUND_TO_POS_INF		0x02
+#define _MM_FROUND_TO_ZERO		0x03
+#define _MM_FROUND_CUR_DIRECTION	0x04
+
+#define _MM_FROUND_RAISE_EXC		0x00
+#define _MM_FROUND_NO_EXC		0x08
+
+#define _MM_FROUND_NINT \
+  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR \
+  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL \
+  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC \
+  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT \
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT \
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+/* Integer blend instructions - select data from 2 sources using
+   constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
+					      (__v8hi)__Y,
+					      __M);
+}
+#else
+#define _mm_blend_epi16(X, Y, M) \
+  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(X), (__v8hi)(Y), (M)))
+#endif
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
+					       (__v16qi)__Y,
+					       (__v16qi)__M);
+}
+
+/* Single precision floating point blend instructions - select data
+   from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
+{
+  return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
+					  (__v4sf)__Y,
+					  __M);
+}
+#else
+#define _mm_blend_ps(X, Y, M) \
+  ((__m128) __builtin_ia32_blendps ((__v4sf)(X), (__v4sf)(Y), (M)))
+#endif
+
+static __inline __m128 __attribute__((__always_inline__))
+_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
+					   (__v4sf)__Y,
+					   (__v4sf)__M);
+}
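(A minimal usage sketch, not part of the patch, assuming -msse4.1; the function and variable names are illustrative.  pblendw selects whole words by a compile-time mask bit per word, while blendvps selects lanes by the sign bit of a third, runtime mask vector.)

#include <smmintrin.h>

__m128i
mix_words (__m128i x, __m128i y)
{
  /* Words 0 and 3 from y (mask bits 0 and 3 set), the rest from x.
     The mask must be a compile-time constant.  */
  return _mm_blend_epi16 (x, y, 0x09);
}

__m128
mix_by_sign (__m128 x, __m128 y, __m128 mask)
{
  /* Lane i comes from y where lane i of mask has its sign bit set.  */
  return _mm_blendv_ps (x, y, mask);
}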
+/* Double precision floating point blend instructions - select data
+   from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__))
+_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
+{
+  return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
+					   (__v2df)__Y,
+					   __M);
+}
+#else
+#define _mm_blend_pd(X, Y, M) \
+  ((__m128d) __builtin_ia32_blendpd ((__v2df)(X), (__v2df)(Y), (M)))
+#endif
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
+					    (__v2df)__Y,
+					    (__v2df)__M);
+}
+
+/* Dot product instructions with mask-defined summing and zeroing parts
+   of result.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
+{
+  return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
+				       (__v4sf)__Y,
+				       __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
+{
+  return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
+					(__v2df)__Y,
+					__M);
+}
+#else
+#define _mm_dp_ps(X, Y, M) \
+  ((__m128) __builtin_ia32_dpps ((__v4sf)(X), (__v4sf)(Y), (M)))
+
+#define _mm_dp_pd(X, Y, M) \
+  ((__m128d) __builtin_ia32_dppd ((__v2df)(X), (__v2df)(Y), (M)))
+#endif
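(Sketch, not part of the patch, assuming -msse4.1: in the 8-bit dpps mask, bits 7:4 choose which element products enter the sum and bits 3:0 choose which result lanes receive it, so 0xF1 computes a full 4-element dot product into lane 0 only.)

#include <smmintrin.h>

float
dot4 (__m128 x, __m128 y)
{
  __m128 d = _mm_dp_ps (x, y, 0xF1);
  float r;
  _mm_store_ss (&r, d);		/* lane 0 holds the dot product */
  return r;
}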
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+   corresponding parts of result.  */
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Min/max packed integer instructions.  */
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_min_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_max_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_min_epu16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_max_epu16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_min_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_max_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_min_epu32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_max_epu32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Packed integer 32-bit multiplication with truncation of upper
+   halves of results.  */
+static __inline __m128i __attribute__((__always_inline__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Packed integer 32-bit multiplication of 2 pairs of operands
+   with two 64-bit results.  */
+static __inline __m128i __attribute__((__always_inline__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Packed integer 128-bit bitwise comparison.  Return 1 if
+   (__V & __M) == 0.  */
+static __inline int __attribute__((__always_inline__))
+_mm_testz_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison.  Return 1 if
+   (__V & ~__M) == 0.  */
+static __inline int __attribute__((__always_inline__))
+_mm_testc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison.  Return 1 if
+   (__V & __M) != 0 && (__V & ~__M) != 0.  */
+static __inline int __attribute__((__always_inline__))
+_mm_testnzc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Macros for packed integer 128-bit comparison intrinsics.  */
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+
+/* Insert single precision float into packed single precision array
+   element selected by index N.  The bits [7-6] of N define S
+   index, the bits [5-4] define D index, and bits [3-0] define
+   zeroing mask for D.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
+{
+  return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
+					      (__v4sf)__S,
+					      __N);
+}
+#else
+#define _mm_insert_ps(D, S, N) \
+  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(D), (__v4sf)(S), (N)))
+#endif
+
+/* Helper macro to create the N value for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
+
+/* Extract binary representation of single precision float from packed
+   single precision array element of X selected by index N.  */
+
+#ifdef __OPTIMIZE__
+static __inline int __attribute__((__always_inline__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
+  union { int i; float f; } __tmp;
+  __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
+  return __tmp.i;
+}
+#else
+#define _mm_extract_ps(X, N) \
+  (__extension__ \
+   ({ \
+      union { int i; float f; } __tmp; \
+      __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(X), (N)); \
+      __tmp.i; \
+    }) \
+  )
+#endif
+
+/* Extract binary representation of single precision float into
+   D from packed single precision array element of S selected
+   by index N.  */
+#define _MM_EXTRACT_FLOAT(D, S, N) \
+  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
+
+/* Extract specified single precision float element into the lower
+   part of __m128.  */
+#define _MM_PICK_OUT_PS(X, N) \
+  _mm_insert_ps (_mm_setzero_ps (), (X), \
+		 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
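(Sketch, not part of the patch, assuming -msse4.1: _MM_MK_INSERTPS_NDX composes the insertps immediate from source element, destination element, and zero mask; extractps returns the raw bit pattern of a lane.)

#include <smmintrin.h>

__m128
move_lane (__m128 d, __m128 s)
{
  /* Element 2 of s into element 0 of d, zeroing element 3 (mask 0x8).  */
  return _mm_insert_ps (d, s, _MM_MK_INSERTPS_NDX (2, 0, 0x8));
}

int
lane1_bits (__m128 x)
{
  return _mm_extract_ps (x, 1);	/* IEEE bits of lane 1, as an int */
}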
+/* Insert integer, S, into packed integer array element of D
+   selected by index N.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi8 (__m128i __D, int __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
+						 __S, __N);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi32 (__m128i __D, int __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
+						__S, __N);
+}
+
+#ifdef __x86_64__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
+						__S, __N);
+}
+#endif
+#else
+#define _mm_insert_epi8(D, S, N) \
+  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(D), (S), (N)))
+
+#define _mm_insert_epi32(D, S, N) \
+  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(D), (S), (N)))
+
+#ifdef __x86_64__
+#define _mm_insert_epi64(D, S, N) \
+  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(D), (S), (N)))
+#endif
+#endif
+
+/* Extract integer from packed integer array element of X selected by
+   index N.  */
+
+#ifdef __OPTIMIZE__
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
+}
+
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
+}
+
+#ifdef __x86_64__
+static __inline long long __attribute__((__always_inline__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
+}
+#endif
+#else
+#define _mm_extract_epi8(X, N) \
+  __builtin_ia32_vec_ext_v16qi ((__v16qi)(X), (N))
+#define _mm_extract_epi32(X, N) \
+  __builtin_ia32_vec_ext_v4si ((__v4si)(X), (N))
+
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) \
+  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(X), (N)))
+#endif
+#endif
+
+/* Return horizontal packed word minimum and its index in bits [15:0]
+   and bits [18:16] respectively.  */
+static __inline __m128i __attribute__((__always_inline__))
+_mm_minpos_epu16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
+}
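(Sketch, not part of the patch, assuming -msse4.1: phminposuw leaves the minimum word in lane 0 and its index in bits 18:16, i.e. the low bits of lane 1; _mm_extract_epi16 is the existing SSE2 word extract.)

#include <smmintrin.h>

unsigned int
min_and_index (__m128i v, int *index)
{
  __m128i r = _mm_minpos_epu16 (v);
  *index = _mm_extract_epi16 (r, 1) & 7;	/* index is 3 bits */
  return (unsigned int) _mm_extract_epi16 (r, 0);
}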
+/* Packed/scalar double precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128d __attribute__((__always_inline__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+static __inline __m128d __attribute__((__always_inline__))
+_mm_round_sd (__m128d __D, __m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+					   (__v2df)__V,
+					   __M);
+}
+#else
+#define _mm_round_pd(V, M) \
+  ((__m128d) __builtin_ia32_roundpd ((__v2df)(V), (M)))
+
+#define _mm_round_sd(D, V, M) \
+  ((__m128d) __builtin_ia32_roundsd ((__v2df)(D), (__v2df)(V), (M)))
+#endif
+
+/* Packed/scalar single precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128 __attribute__((__always_inline__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+static __inline __m128 __attribute__((__always_inline__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+					  (__v4sf)__V,
+					  __M);
+}
+#else
+#define _mm_round_ps(V, M) \
+  ((__m128) __builtin_ia32_roundps ((__v4sf)(V), (M)))
+
+#define _mm_round_ss(D, V, M) \
+  ((__m128) __builtin_ia32_roundss ((__v4sf)(D), (__v4sf)(V), (M)))
+#endif
+
+/* Macros for ceil/floor intrinsics.  */
+#define _mm_ceil_pd(V)	   _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V)	   _mm_round_pd ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V)	   _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V)	   _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
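(Sketch, not part of the patch, assuming -msse4.1: the rounding mode is a compile-time immediate composed from the _MM_FROUND_* macros above; the ceil/floor macros are simply fixed-mode spellings of _mm_round_*.)

#include <smmintrin.h>

__m128
floor4 (__m128 x)
{
  return _mm_floor_ps (x);	/* roundps with _MM_FROUND_FLOOR */
}

__m128d
trunc2_quiet (__m128d x)
{
  /* Truncate toward zero without raising the inexact exception.  */
  return _mm_round_pd (x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}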
+/* Packed integer sign-extension.  */
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
+}
+
+/* Packed integer zero-extension.  */
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
+}
+
+static __inline __m128i __attribute__((__always_inline__))
+_mm_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
+}
+
+/* Pack 8 double words from 2 operands into 8 words of result with
+   unsigned saturation.  */
+static __inline __m128i __attribute__((__always_inline__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+
+#ifdef __OPTIMIZE__
+static __inline __m128i __attribute__((__always_inline__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+					      (__v16qi)__Y, __M);
+}
+#else
+#define _mm_mpsadbw_epu8(X, Y, M) \
+  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(X), (__v16qi)(Y), (M)))
+#endif
+
+/* Load double quadword using non-temporal aligned hint.  */
+static __inline __m128i __attribute__((__always_inline__))
+_mm_stream_load_si128 (__m128i *__X)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
+}
+
+#endif /* __SSE4_1__ */
+
+#endif /* _SMMINTRIN_H_INCLUDED */
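(Sketch, not part of the patch, assuming -msse4.1: a typical widen-operate-narrow pairing.  pmovzxbw zero-extends the low eight bytes to words; packusdw narrows two dword vectors back to words with unsigned saturation.)

#include <smmintrin.h>

__m128i
widen_low_bytes (__m128i v)
{
  return _mm_cvtepu8_epi16 (v);		/* 8 x u8  -> 8 x u16 */
}

__m128i
narrow_dwords (__m128i lo, __m128i hi)
{
  return _mm_packus_epi32 (lo, hi);	/* 8 x i32 -> 8 x u16, saturated */
}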
"=x") (plus:V4SI @@ -2923,46 +2980,64 @@ (match_operand:V4SI 2 "register_operand" "")))] "TARGET_SSE2" { - rtx t1, t2, t3, t4, t5, t6, thirtytwo; - rtx op0, op1, op2; + if (TARGET_SSE4_1) + ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands); + else + { + rtx t1, t2, t3, t4, t5, t6, thirtytwo; + rtx op0, op1, op2; - op0 = operands[0]; - op1 = operands[1]; - op2 = operands[2]; - t1 = gen_reg_rtx (V4SImode); - t2 = gen_reg_rtx (V4SImode); - t3 = gen_reg_rtx (V4SImode); - t4 = gen_reg_rtx (V4SImode); - t5 = gen_reg_rtx (V4SImode); - t6 = gen_reg_rtx (V4SImode); - thirtytwo = GEN_INT (32); + op0 = operands[0]; + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V4SImode); + t4 = gen_reg_rtx (V4SImode); + t5 = gen_reg_rtx (V4SImode); + t6 = gen_reg_rtx (V4SImode); + thirtytwo = GEN_INT (32); - /* Multiply elements 2 and 0. */ - emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), op1, op2)); + /* Multiply elements 2 and 0. */ + emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), + op1, op2)); - /* Shift both input vectors down one element, so that elements 3 and 1 - are now in the slots for elements 2 and 0. For K8, at least, this is - faster than using a shuffle. */ - emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2), - gen_lowpart (TImode, op1), thirtytwo)); - emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3), - gen_lowpart (TImode, op2), thirtytwo)); + /* Shift both input vectors down one element, so that elements 3 + and 1 are now in the slots for elements 2 and 0. For K8, at + least, this is faster than using a shuffle. */ + emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2), + gen_lowpart (TImode, op1), + thirtytwo)); + emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3), + gen_lowpart (TImode, op2), + thirtytwo)); + /* Multiply elements 3 and 1. */ + emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), + t2, t3)); - /* Multiply elements 3 and 1. */ - emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), t2, t3)); - - /* Move the results in element 2 down to element 1; we don't care what - goes in elements 2 and 3. */ - emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx, + /* Move the results in element 2 down to element 1; we don't care + what goes in elements 2 and 3. */ + emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx, const0_rtx, const0_rtx)); + emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); - /* Merge the parts back together. */ - emit_insn (gen_sse2_punpckldq (op0, t5, t6)); - DONE; + /* Merge the parts back together. 
 (define_insn "sse2_pmaddwd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
 	(plus:V4SI
@@ -2923,46 +2980,64 @@
 	  (match_operand:V4SI 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);
+  else
+    {
+      rtx t1, t2, t3, t4, t5, t6, thirtytwo;
+      rtx op0, op1, op2;
 
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-  t3 = gen_reg_rtx (V4SImode);
-  t4 = gen_reg_rtx (V4SImode);
-  t5 = gen_reg_rtx (V4SImode);
-  t6 = gen_reg_rtx (V4SImode);
-  thirtytwo = GEN_INT (32);
+      op0 = operands[0];
+      op1 = operands[1];
+      op2 = operands[2];
+      t1 = gen_reg_rtx (V4SImode);
+      t2 = gen_reg_rtx (V4SImode);
+      t3 = gen_reg_rtx (V4SImode);
+      t4 = gen_reg_rtx (V4SImode);
+      t5 = gen_reg_rtx (V4SImode);
+      t6 = gen_reg_rtx (V4SImode);
+      thirtytwo = GEN_INT (32);
 
-  /* Multiply elements 2 and 0.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), op1, op2));
+      /* Multiply elements 2 and 0.  */
+      emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
+					 op1, op2));
 
-  /* Shift both input vectors down one element, so that elements 3 and 1
-     are now in the slots for elements 2 and 0.  For K8, at least, this is
-     faster than using a shuffle.  */
-  emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
-			       gen_lowpart (TImode, op1), thirtytwo));
-  emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
-			       gen_lowpart (TImode, op2), thirtytwo));
+      /* Shift both input vectors down one element, so that elements 3
+	 and 1 are now in the slots for elements 2 and 0.  For K8, at
+	 least, this is faster than using a shuffle.  */
+      emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
+				   gen_lowpart (TImode, op1),
+				   thirtytwo));
+      emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
+				   gen_lowpart (TImode, op2),
+				   thirtytwo));
+      /* Multiply elements 3 and 1.  */
+      emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
+					 t2, t3));
 
-  /* Multiply elements 3 and 1.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), t2, t3));
-
-  /* Move the results in element 2 down to element 1; we don't care what
-     goes in elements 2 and 3.  */
-  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
+      /* Move the results in element 2 down to element 1; we don't care
+	 what goes in elements 2 and 3.  */
+      emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
 				    const0_rtx, const0_rtx));
+      emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
+				    const0_rtx, const0_rtx));
 
-  /* Merge the parts back together.  */
-  emit_insn (gen_sse2_punpckldq (op0, t5, t6));
-  DONE;
+      /* Merge the parts back together.  */
+      emit_insn (gen_sse2_punpckldq (op0, t5, t6));
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_mulv4si3"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%0")
+		   (match_operand:V4SI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
+  "pmulld\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
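For contrast, the SSE2 fallback emitted by the mulv4si3 expander above, sketched with SSE2 intrinsics (the function name is hypothetical); with -msse4.1 the whole sequence collapses into the single pmulld pattern that follows it:

#include <emmintrin.h>

/* Mirror of the expander: pmuludq multiplies elements 0 and 2; a
   32-bit whole-register shift moves elements 3 and 1 into those
   slots for a second pmuludq; pshufd then compacts the low halves
   and punpckldq merges them.  */
static __m128i
mulv4si_sse2 (__m128i op1, __m128i op2)
{
  __m128i t1 = _mm_mul_epu32 (op1, op2);             /* elements 2 and 0 */
  __m128i t2 = _mm_srli_si128 (op1, 4);
  __m128i t3 = _mm_srli_si128 (op2, 4);
  __m128i t4 = _mm_mul_epu32 (t2, t3);               /* elements 3 and 1 */
  __m128i t5 = _mm_shuffle_epi32 (t1, _MM_SHUFFLE (0, 0, 2, 0));
  __m128i t6 = _mm_shuffle_epi32 (t4, _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (t5, t6);
}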
 (define_expand "mulv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "")
 	(mult:V2DI (match_operand:V2DI 1 "register_operand" "")
@@ -3323,16 +3398,22 @@
    (set_attr "mode" "TI")])
 
 (define_expand "umaxv8hi3"
-  [(set (match_operand:V8HI 0 "register_operand" "=x")
-	(us_minus:V8HI (match_operand:V8HI 1 "register_operand" "0")
-		       (match_operand:V8HI 2 "nonimmediate_operand" "xm")))
-   (set (match_dup 3)
-	(plus:V8HI (match_dup 0) (match_dup 2)))]
+  [(set (match_operand:V8HI 0 "register_operand" "")
+	(umax:V8HI (match_operand:V8HI 1 "register_operand" "")
+		   (match_operand:V8HI 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2"
 {
-  operands[3] = operands[0];
-  if (rtx_equal_p (operands[0], operands[2]))
-    operands[0] = gen_reg_rtx (V8HImode);
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMAX, V8HImode, operands);
+  else
+    {
+      rtx op0 = operands[0], op2 = operands[2], op3 = op0;
+      if (rtx_equal_p (op3, op2))
+	op3 = gen_reg_rtx (V8HImode);
+      emit_insn (gen_sse2_ussubv8hi3 (op3, operands[1], op2));
+      emit_insn (gen_addv8hi3 (op0, op3, op2));
+      DONE;
+    }
 })
 
 (define_expand "smax<mode>3"
@@ -3341,40 +3422,72 @@
 	  (match_operand:SSEMODE14 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (SMAX, <MODE>mode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
 
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+      xops[0] = operands[0];
+      xops[1] = operands[1];
+      xops[2] = operands[2];
+      xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_smax<mode>3"
+  [(set (match_operand:SSEMODE14 0 "register_operand" "=x")
+	(smax:SSEMODE14
+	  (match_operand:SSEMODE14 1 "nonimmediate_operand" "%0")
+	  (match_operand:SSEMODE14 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (SMAX, <MODE>mode, operands)"
+  "pmaxs<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "umaxv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "")
 	(umax:V4SI (match_operand:V4SI 1 "register_operand" "")
 		   (match_operand:V4SI 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMAX, V4SImode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
 
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+      xops[0] = operands[0];
+      xops[1] = operands[1];
+      xops[2] = operands[2];
+      xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_umax<mode>3"
+  [(set (match_operand:SSEMODE24 0 "register_operand" "=x")
+	(umax:SSEMODE24
+	  (match_operand:SSEMODE24 1 "nonimmediate_operand" "%0")
+	  (match_operand:SSEMODE24 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (UMAX, <MODE>mode, operands)"
+  "pmaxu<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "uminv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "")
 	(umin:V16QI (match_operand:V16QI 1 "nonimmediate_operand" "")
@@ -3415,40 +3528,72 @@
 	  (match_operand:SSEMODE14 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (SMIN, <MODE>mode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
 
-  xops[0] = operands[0];
-  xops[1] = operands[2];
-  xops[2] = operands[1];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+      xops[0] = operands[0];
+      xops[1] = operands[2];
+      xops[2] = operands[1];
+      xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_smin<mode>3"
+  [(set (match_operand:SSEMODE14 0 "register_operand" "=x")
+	(smin:SSEMODE14
+	  (match_operand:SSEMODE14 1 "nonimmediate_operand" "%0")
+	  (match_operand:SSEMODE14 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (SMIN, <MODE>mode, operands)"
+  "pmins<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "umin<mode>3"
   [(set (match_operand:SSEMODE24 0 "register_operand" "")
 	(umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "")
 			(match_operand:SSEMODE24 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMIN, <MODE>mode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
 
-  xops[0] = operands[0];
-  xops[1] = operands[2];
-  xops[2] = operands[1];
-  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+      xops[0] = operands[0];
+      xops[1] = operands[2];
+      xops[2] = operands[1];
+      xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_umin<mode>3"
+  [(set (match_operand:SSEMODE24 0 "register_operand" "=x")
+	(umin:SSEMODE24
+	  (match_operand:SSEMODE24 1 "nonimmediate_operand" "%0")
+	  (match_operand:SSEMODE24 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (UMIN, <MODE>mode, operands)"
+  "pminu<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
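The pre-SSE4.1 trick used by the umaxv8hi3 expander above, sketched as C (hypothetical function name): for unsigned values, max (a, b) == (a -sat b) + b, so one saturating subtract plus one add stands in for the missing pmaxuw:

#include <emmintrin.h>

static __m128i
umax_epu16_sse2 (__m128i a, __m128i b)
{
  return _mm_add_epi16 (_mm_subs_epu16 (a, b), b);  /* psubusw + paddw */
}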
(set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + (define_insn "sse2_gt3" [(set (match_operand:SSEMODE124 0 "register_operand" "=x") (gt:SSEMODE124 @@ -3989,6 +4145,22 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*sse4_1_pinsrb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_merge:V16QI + (vec_duplicate:V16QI + (match_operand:QI 2 "nonimmediate_operand" "rm")) + (match_operand:V16QI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_32768_operand" "n")))] + "TARGET_SSE4_1" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "pinsrb\t{%3, %k2, %0|%0, %k2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + (define_insn "*sse2_pinsrw" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_merge:V8HI @@ -4005,6 +4177,62 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +;; It must come before sse2_loadld since it is preferred. +(define_insn "*sse4_1_pinsrd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 2 "nonimmediate_operand" "rm")) + (match_operand:V4SI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))] + "TARGET_SSE4_1" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "pinsrd\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pinsrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_merge:V2DI + (vec_duplicate:V2DI + (match_operand:DI 2 "nonimmediate_operand" "rm")) + (match_operand:V2DI 1 "register_operand" "0") + (match_operand:SI 3 "const_pow2_1_to_2_operand" "n")))] + "TARGET_SSE4_1" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "pinsrq\t{%3, %2, %0|%0, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrb" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI + (vec_select:QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")]))))] + "TARGET_SSE4_1" + "pextrb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrb_memory" + [(set (match_operand:QI 0 "memory_operand" "=m") + (vec_select:QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")])))] + "TARGET_SSE4_1" + "pextrb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + (define_insn "*sse2_pextrw" [(set (match_operand:SI 0 "register_operand" "=r") (zero_extend:SI @@ -4017,6 +4245,40 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*sse4_1_pextrw_memory" + [(set (match_operand:HI 0 "memory_operand" "=m") + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")])))] + "TARGET_SSE4_1" + "pextrw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_pextrd" + [(set (match_operand:SI 0 "nonimmediate_operand" "=rm") + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] + "TARGET_SSE4_1" + "pextrd\t{%2, %1, %0|%0, %1, %2}" 
+ [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +;; It must come before *vec_extractv2di_1_sse since it is preferred. +(define_insn "*sse4_1_pextrq" + [(set (match_operand:DI 0 "nonimmediate_operand" "=rm") + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "x") + (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))] + "TARGET_SSE4_1 && TARGET_64BIT" + "pextrq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + (define_expand "sse2_pshufd" [(match_operand:V4SI 0 "register_operand" "") (match_operand:V4SI 1 "nonimmediate_operand" "") @@ -5500,3 +5762,577 @@ [(set_attr "type" "sseins") (set_attr "prefix_rep" "1") (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Intel SSE4.1 instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse4_1_blendpd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (match_operand:V2DF 2 "nonimmediate_operand" "xm") + (match_operand:V2DF 1 "register_operand" "0") + (match_operand:SI 3 "const_0_to_3_operand" "n")))] + "TARGET_SSE4_1" + "blendpd\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2DF")]) + +(define_insn "sse4_1_blendps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 1 "register_operand" "0") + (match_operand:SI 3 "const_0_to_15_operand" "n")))] + "TARGET_SSE4_1" + "blendps\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "sse4_1_blendvpd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "0") + (match_operand:V2DF 2 "nonimmediate_operand" "xm") + (reg:V2DF 21)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "blendvpd\t{%%xmm0, %2, %0|%0, %2, %%xmm0}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2DF")]) + +(define_insn "sse4_1_blendvps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (reg:V4SF 21)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "blendvps\t{%%xmm0, %2, %0|%0, %2, %%xmm0}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "sse4_1_dppd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "%0") + (match_operand:V2DF 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_DP))] + "TARGET_SSE4_1" + "dppd\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V2DF")]) + +(define_insn "sse4_1_dpps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "%0") + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_DP))] + "TARGET_SSE4_1" + "dpps\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssemul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "V4SF")]) + +(define_insn "sse4_1_movntdqa" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "memory_operand" "m")] + 
+(define_insn "sse4_1_extendv8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(sign_extend:V8HI
+	  (vec_select:V8QI
+	    (match_operand:V16QI 1 "register_operand" "x")
+	    (parallel [(const_int 0)
+		       (const_int 1)
+		       (const_int 2)
+		       (const_int 3)
+		       (const_int 4)
+		       (const_int 5)
+		       (const_int 6)
+		       (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(sign_extend:V8HI
+	  (vec_select:V8QI
+	    (vec_duplicate:V16QI
+	      (match_operand:V8QI 1 "nonimmediate_operand" "xm"))
+	    (parallel [(const_int 0)
+		       (const_int 1)
+		       (const_int 2)
+		       (const_int 3)
+		       (const_int 4)
+		       (const_int 5)
+		       (const_int 6)
+		       (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(sign_extend:V4SI
+	  (vec_select:V4QI
+	    (match_operand:V16QI 1 "register_operand" "x")
+	    (parallel [(const_int 0)
+		       (const_int 1)
+		       (const_int 2)
+		       (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(sign_extend:V4SI
+	  (vec_select:V4QI
+	    (vec_duplicate:V16QI
+	      (match_operand:V4QI 1 "nonimmediate_operand" "xm"))
+	    (parallel [(const_int 0)
+		       (const_int 1)
+		       (const_int 2)
+		       (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(sign_extend:V2DI
+	  (vec_select:V2QI
+	    (match_operand:V16QI 1 "register_operand" "x")
+	    (parallel [(const_int 0)
+		       (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(sign_extend:V2DI
+	  (vec_select:V2QI
+	    (vec_duplicate:V16QI
+	      (match_operand:V2QI 1 "nonimmediate_operand" "xm"))
+	    (parallel [(const_int 0)
+		       (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(sign_extend:V4SI
+	  (vec_select:V4HI
+	    (match_operand:V8HI 1 "register_operand" "x")
+	    (parallel [(const_int 0)
+		       (const_int 1)
+		       (const_int 2)
+		       (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(sign_extend:V4SI
+	  (vec_select:V4HI
+	    (vec_duplicate:V8HI
+	      (match_operand:V4HI 1 "nonimmediate_operand" "xm"))
+	    (parallel [(const_int 0)
+		       (const_int 1)
+		       (const_int 2)
+		       (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(sign_extend:V2DI
+	  (vec_select:V2HI
+	    (match_operand:V8HI 1 "register_operand" "x")
+	    (parallel [(const_int 0)
+		       (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(sign_extend:V2DI
+	  (vec_select:V2HI
+	    (vec_duplicate:V8HI
+	      (match_operand:V2HI 1 "nonimmediate_operand" "xm"))
+	    (parallel [(const_int 0)
+		       (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(sign_extend:V2DI
+	  (vec_select:V2SI
+	    (match_operand:V4SI 1 "register_operand" "x")
+	    (parallel [(const_int 0)
+		       (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(sign_extend:V2DI
+	  (vec_select:V2SI
+	    (vec_duplicate:V4SI
+	      (match_operand:V2SI 1 "nonimmediate_operand" "xm"))
+	    (parallel [(const_int 0)
+		       (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
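A sketch of the sign-extension patterns above (usual SSE4.1 intrinsic name assumed; the function is hypothetical): pmovsxbw widens the low eight signed bytes to eight 16-bit lanes in a single instruction, where SSE2 needs an unpack plus an arithmetic shift:

#include <smmintrin.h>

static __m128i
widen_signed_bytes (__m128i v)
{
  return _mm_cvtepi8_epi16 (v);		/* pmovsxbw */
}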
"=x") + (sign_extend:V4SI + (vec_select:V4QI + (vec_duplicate:V16QI + (match_operand:V4QI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovsxbd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_extendv2qiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (sign_extend:V2DI + (vec_select:V2QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovsxbq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_extendv2qiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (sign_extend:V2DI + (vec_select:V2QI + (vec_duplicate:V16QI + (match_operand:V2QI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovsxbq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_extendv4hiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovsxwd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_extendv4hiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (sign_extend:V4SI + (vec_select:V4HI + (vec_duplicate:V8HI + (match_operand:V2HI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovsxwd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_extendv2hiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (sign_extend:V2DI + (vec_select:V2HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovsxwq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_extendv2hiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (sign_extend:V2DI + (vec_select:V2HI + (vec_duplicate:V8HI + (match_operand:V8HI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovsxwq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_extendv2siv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovsxdq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_extendv2siv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (sign_extend:V2DI + (vec_select:V2SI + (vec_duplicate:V4SI + (match_operand:V2SI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovsxdq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_zero_extendv8qiv8hi2" + [(set (match_operand:V8HI 0 
"register_operand" "=x") + (zero_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3) + (const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)]))))] + "TARGET_SSE4_1" + "pmovzxbw\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_zero_extendv8qiv8hi2" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (zero_extend:V8HI + (vec_select:V8QI + (vec_duplicate:V16QI + (match_operand:V8QI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3) + (const_int 4) + (const_int 5) + (const_int 6) + (const_int 7)]))))] + "TARGET_SSE4_1" + "pmovzxbw\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_zero_extendv4qiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (zero_extend:V4SI + (vec_select:V4QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovzxbd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_zero_extendv4qiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (zero_extend:V4SI + (vec_select:V4QI + (vec_duplicate:V16QI + (match_operand:V4QI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovzxbd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_zero_extendv2qiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (zero_extend:V2DI + (vec_select:V2QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovzxbq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_zero_extendv2qiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (zero_extend:V2DI + (vec_select:V2QI + (vec_duplicate:V16QI + (match_operand:V2QI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1)]))))] + "TARGET_SSE4_1" + "pmovzxbq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_zero_extendv4hiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (zero_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovzxwd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_zero_extendv4hiv4si2" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (zero_extend:V4SI + (vec_select:V4HI + (vec_duplicate:V8HI + (match_operand:V4HI 1 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 1) + (const_int 2) + (const_int 3)]))))] + "TARGET_SSE4_1" + "pmovzxwd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_insn "sse4_1_zero_extendv2hiv2di2" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (zero_extend:V2DI + (vec_select:V2HI + (match_operand:V8HI 1 "register_operand" "x") + 
+(define_insn "sse4_1_roundpd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+	(unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "xm")
+		      (match_operand:SI 2 "const_0_to_15_operand" "n")]
+		     UNSPEC_ROUNDP))]
+  "TARGET_SSE4_1"
+  "roundpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_roundps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")
+		      (match_operand:SI 2 "const_0_to_15_operand" "n")]
+		     UNSPEC_ROUNDP))]
+  "TARGET_SSE4_1"
+  "roundps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_roundsd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+	(vec_merge:V2DF
+	  (unspec:V2DF [(match_operand:V2DF 2 "register_operand" "x")
+			(match_operand:SI 3 "const_0_to_15_operand" "n")]
+		       UNSPEC_ROUNDS)
+	  (match_operand:V2DF 1 "register_operand" "0")
+	  (const_int 1)))]
+  "TARGET_SSE4_1"
+  "roundsd\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_roundss"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_merge:V4SF
+	  (unspec:V4SF [(match_operand:V4SF 2 "register_operand" "x")
+			(match_operand:SI 3 "const_0_to_15_operand" "n")]
+		       UNSPEC_ROUNDS)
+	  (match_operand:V4SF 1 "register_operand" "0")
+	  (const_int 1)))]
+  "TARGET_SSE4_1"
+  "roundss\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
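A sketch of the round patterns above (usual SSE4.1 intrinsic and macro names assumed; the function is hypothetical): the immediate selects the rounding mode without touching MXCSR, e.g. _MM_FROUND_FLOOR rounds toward negative infinity:

#include <smmintrin.h>

static __m128d
floor_sketch (__m128d x)
{
  return _mm_round_pd (x, _MM_FROUND_FLOOR);	/* roundpd */
}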
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 2a310571b71..a09e4530977 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -7396,6 +7396,84 @@ v4si __builtin_ia32_pabsd128 (v4si)
 v8hi __builtin_ia32_pabsw128 (v8hi)
 @end smallexample
 
+The following built-in functions are available when @option{-msse4.1} is
+used.  All of them generate the machine instruction that is part of the
+name.
+
+@smallexample
+v2df __builtin_ia32_blendpd (v2df, v2df, const int)
+v4sf __builtin_ia32_blendps (v4sf, v4sf, const int)
+v2df __builtin_ia32_blendvpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_blendvps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_dppd (v2df, v2df, const int)
+v4sf __builtin_ia32_dpps (v4sf, v4sf, const int)
+v4sf __builtin_ia32_insertps128 (v4sf, v4sf, const int)
+v2di __builtin_ia32_movntdqa (v2di *)
+v16qi __builtin_ia32_mpsadbw128 (v16qi, v16qi, const int)
+v8hi __builtin_ia32_packusdw128 (v4si, v4si)
+v16qi __builtin_ia32_pblendvb128 (v16qi, v16qi, v16qi)
+v8hi __builtin_ia32_pblendw128 (v8hi, v8hi, const int)
+v2di __builtin_ia32_pcmpeqq (v2di, v2di)
+v8hi __builtin_ia32_phminposuw128 (v8hi)
+v16qi __builtin_ia32_pmaxsb128 (v16qi, v16qi)
+v4si __builtin_ia32_pmaxsd128 (v4si, v4si)
+v4si __builtin_ia32_pmaxud128 (v4si, v4si)
+v8hi __builtin_ia32_pmaxuw128 (v8hi, v8hi)
+v16qi __builtin_ia32_pminsb128 (v16qi, v16qi)
+v4si __builtin_ia32_pminsd128 (v4si, v4si)
+v4si __builtin_ia32_pminud128 (v4si, v4si)
+v8hi __builtin_ia32_pminuw128 (v8hi, v8hi)
+v4si __builtin_ia32_pmovsxbd128 (v16qi)
+v2di __builtin_ia32_pmovsxbq128 (v16qi)
+v8hi __builtin_ia32_pmovsxbw128 (v16qi)
+v2di __builtin_ia32_pmovsxdq128 (v4si)
+v4si __builtin_ia32_pmovsxwd128 (v8hi)
+v2di __builtin_ia32_pmovsxwq128 (v8hi)
+v4si __builtin_ia32_pmovzxbd128 (v16qi)
+v2di __builtin_ia32_pmovzxbq128 (v16qi)
+v8hi __builtin_ia32_pmovzxbw128 (v16qi)
+v2di __builtin_ia32_pmovzxdq128 (v4si)
+v4si __builtin_ia32_pmovzxwd128 (v8hi)
+v2di __builtin_ia32_pmovzxwq128 (v8hi)
+v2di __builtin_ia32_pmuldq128 (v4si, v4si)
+v4si __builtin_ia32_pmulld128 (v4si, v4si)
+int __builtin_ia32_ptestc128 (v2di, v2di)
+int __builtin_ia32_ptestnzc128 (v2di, v2di)
+int __builtin_ia32_ptestz128 (v2di, v2di)
+v2df __builtin_ia32_roundpd (v2df, const int)
+v4sf __builtin_ia32_roundps (v4sf, const int)
+v2df __builtin_ia32_roundsd (v2df, v2df, const int)
+v4sf __builtin_ia32_roundss (v4sf, v4sf, const int)
+@end smallexample
+
+The following built-in functions are available when @option{-msse4.1} is
+used.
+
+@table @code
+@item v4sf __builtin_ia32_vec_set_v4sf (v4sf, float, const int)
+Generates the @code{insertps} machine instruction.
+@item int __builtin_ia32_vec_ext_v16qi (v16qi, const int)
+Generates the @code{pextrb} machine instruction.
+@item v16qi __builtin_ia32_vec_set_v16qi (v16qi, int, const int)
+Generates the @code{pinsrb} machine instruction.
+@item v4si __builtin_ia32_vec_set_v4si (v4si, int, const int)
+Generates the @code{pinsrd} machine instruction.
+@item v2di __builtin_ia32_vec_set_v2di (v2di, long long, const int)
+Generates the @code{pinsrq} machine instruction in 64-bit mode.
+@end table
+
+The following built-in functions are changed to generate new SSE4.1
+instructions when @option{-msse4.1} is used.
+
+@table @code
+@item float __builtin_ia32_vec_ext_v4sf (v4sf, const int)
+Generates the @code{extractps} machine instruction.
+@item int __builtin_ia32_vec_ext_v4si (v4si, const int)
+Generates the @code{pextrd} machine instruction.
+@item long long __builtin_ia32_vec_ext_v2di (v2di, const int)
+Generates the @code{pextrq} machine instruction in 64-bit mode.
+@end table
+
 The following built-in functions are available when @option{-msse4a} is
 used.
 
 @smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d8260ba120b..21ef96cae7c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -547,7 +547,8 @@ Objective-C and Objective-C++ Dialects}.
 -mno-fp-ret-in-387 -msoft-float @gol
 -mno-wide-multiply -mrtd -malign-double @gol
 -mpreferred-stack-boundary=@var{num} -mcx16 -msahf @gol
--mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol
+-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 @gol
+-msse4a -m3dnow -mpopcnt -mabm @gol
 -mthreads -mno-align-stringops -minline-all-stringops @gol
 -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
 -m96bit-long-double -mregparm=@var{num} -msseregparm @gol
@@ -10260,6 +10261,8 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mno-sse3
 @item -mssse3
 @itemx -mno-ssse3
+@item -msse4.1
+@itemx -mno-sse4.1
 @item -msse4a
 @item -mno-sse4a
 @item -m3dnow
@@ -10275,7 +10278,8 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX,
-SSE, SSE2, SSE3, SSSE3, SSE4A, ABM or 3DNow! extended instruction sets.
+SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4A, ABM or 3DNow! extended
+instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
 disabled by these switches.
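Finally, a hypothetical end-to-end check of the documented interface: built with gcc -msse4.1, the compiler defines __SSE4_1__, smmintrin.h becomes usable, and the v4si multiply below maps to the new pmulld pattern instead of the long SSE2 expansion:

#ifdef __SSE4_1__
#include <smmintrin.h>

__m128i
mul32 (__m128i a, __m128i b)
{
  return _mm_mullo_epi32 (a, b);	/* pmulld */
}
#endif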