From 99017161354321845d11dce4fcd3abfebc5dd0d5 Mon Sep 17 00:00:00 2001
From: Andrew Senkevich
Date: Fri, 24 Jul 2015 14:47:23 +0300
Subject: [PATCH] Fixed several libmvec bugs found during testing on KNL
 hardware.

The AVX512 IFUNC implementations, the wrappers to the AVX2 versions, and the
KNL expf implementation are fixed.

	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
	implementation.
---
 ChangeLog                                          |  19 ++
 .../x86_64/fpu/multiarch/svml_d_cos8_core.S        |  10 +-
 .../x86_64/fpu/multiarch/svml_d_exp8_core.S        |  10 +-
 .../x86_64/fpu/multiarch/svml_d_log8_core.S        |  10 +-
 .../x86_64/fpu/multiarch/svml_d_pow8_core.S        |  10 +-
 .../x86_64/fpu/multiarch/svml_d_sin8_core.S        |  10 +-
 .../fpu/multiarch/svml_d_sincos8_core.S            |  10 +-
 .../x86_64/fpu/multiarch/svml_s_cosf16_core.S      |  10 +-
 .../x86_64/fpu/multiarch/svml_s_expf16_core.S      |  10 +-
 .../fpu/multiarch/svml_s_expf16_core_avx512.S      |   1 +
 .../x86_64/fpu/multiarch/svml_s_logf16_core.S      |  10 +-
 .../x86_64/fpu/multiarch/svml_s_powf16_core.S      |  10 +-
 .../fpu/multiarch/svml_s_sincosf16_core.S          |  10 +-
 .../x86_64/fpu/multiarch/svml_s_sinf16_core.S      |  10 +-
 sysdeps/x86_64/fpu/svml_d_wrapper_impl.h           | 202 +++++++-----------
 sysdeps/x86_64/fpu/svml_s_wrapper_impl.h           | 101 +++++----
 16 files changed, 220 insertions(+), 223 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 6f6016db43..3e22413e8a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-07-24  Andrew Senkevich
+
+	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
+	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
+	implementation.
+
 2015-07-24  Szabolcs Nagy
 
 	[BZ #17711]
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f69f..d0f4f27f46 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)
 
 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fbfb9..7b7c07d926 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)
 
 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8892..76375fdae0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)
 
 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511e51..c1e5e76f92 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)
 
 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba631020f3..131f2f47c5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)
 
 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba549a..e33109099e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)
 
 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de22a..0654d3c19b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)
 
 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d05a..62858eb39e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)
 
 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0757..ec69055351 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
      The table lookup is skipped if k = 0.
      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
 
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq      %rsp, %rbp
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750c86..68c57e4386 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)
 
 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fbc04..3aa9f952ce 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)
 
 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753eab7..bdcabab6e2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)
 
 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b8e6..3ec78a0b5e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)
 
 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8edfa..5c0ff897c0 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@
 
 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq     %rbp
+	pushq	%rbp
 	cfi_adjust_cfa_offset (8)
 	cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+	movq	%rsp, %rbp
 	cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
-        call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-        call HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+	andq	$-64, %rsp
+	subq	$128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x11
+	.byte	0x04
+	.byte	0x24
+	vmovupd	(%rsp), %ymm0
+	call	HIDDEN_JUMPTARGET(\callee)
+	vmovupd	%ymm0, 64(%rsp)
+	vmovupd	32(%rsp), %ymm0
+	call	HIDDEN_JUMPTARGET(\callee)
+	vmovupd	%ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x10
+	.byte	0x44
+	.byte	0x24
+	.byte	0x01
+	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
-        popq      %rbp
+	popq	%rbp
 	cfi_adjust_cfa_offset (-8)
 	cfi_restore (%rbp)
 	ret
@@ -234,61 +234,50 @@
 
 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq     %rbp
+	pushq	%rbp
 	cfi_adjust_cfa_offset (8)
 	cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+	movq	%rsp, %rbp
 	cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x4c
-        .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x40
-        call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x60
-        call HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+	andq	$-64, %rsp
+	subq	$192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x11
+	.byte	0x04
+	.byte	0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x11
+	.byte	0x4c
+	.byte	0x24
+	.byte	0x01
+	vmovupd	(%rsp), %ymm0
+	vmovupd	64(%rsp), %ymm1
+	call	HIDDEN_JUMPTARGET(\callee)
+	vmovupd	%ymm0, 128(%rsp)
+	vmovupd	32(%rsp), %ymm0
+	vmovupd	96(%rsp), %ymm1
+	call	HIDDEN_JUMPTARGET(\callee)
+	vmovupd	%ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x10
+	.byte	0x44
+	.byte	0x24
+	.byte	0x02
+	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
-        popq      %rbp
+	popq	%rbp
 	cfi_adjust_cfa_offset (-8)
 	cfi_restore (%rbp)
 	ret
@@ -310,61 +299,26 @@
 	cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
         .byte 0x62
         .byte 0xf1
         .byte 0x7c
         .byte 0x48
-        .byte 0x29
+	.byte	0x11
         .byte 0x04
         .byte 0x24
         movq      %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x04
-        .byte 0x24
+	vmovupd	(%rsp), %ymm0
         call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
+	vmovupd	32(%rsp), %ymm0
         lea 64(%rsp), %rdi
         lea 96(%rsp), %rsi
         call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x44
-        .byte 0x24
-        .byte 0x40
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte 0xc5
-        .byte 0xfd
-        .byte 0x28
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x60
-/* Below is encoding for vmovapd %ymm0, 32(%r12).  */
-        .byte 0xc4
-        .byte 0xc1
-        .byte 0x7d
-        .byte 0x29
-        .byte 0x44
-        .byte 0x24
-        .byte 0x20
-/* Below is encoding for vmovapd %ymm1, 32(%r13).  */
-        .byte 0xc4
-        .byte 0xc1
-        .byte 0x7d
-        .byte 0x29
-        .byte 0x4d
-        .byte 0x20
+	vmovupd	64(%rsp), %ymm0
+	vmovupd	96(%rsp), %ymm1
+	vmovupd	%ymm0, 32(%r12)
+	vmovupd	%ymm1, 32(%r13)
+	vzeroupper
         addq      $176, %rsp
         popq      %r13
 	cfi_adjust_cfa_offset (-8)
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081c9d..d255d195ee 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@
 
 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq     %rbp
+	pushq	%rbp
 	cfi_adjust_cfa_offset (8)
 	cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
+	movq	%rsp, %rbp
 	cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-        vmovaps (%rsp), %ymm0
-        call HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        call HIDDEN_JUMPTARGET(\callee)
-        movq      %rbp, %rsp
+	andq	$-64, %rsp
+	subq	$128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x11
+	.byte	0x04
+	.byte	0x24
+	vmovupd	(%rsp), %ymm0
+	call	HIDDEN_JUMPTARGET(\callee)
+	vmovupd	%ymm0, 64(%rsp)
+	vmovupd	32(%rsp), %ymm0
+	call	HIDDEN_JUMPTARGET(\callee)
+	vmovupd	%ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x10
+	.byte	0x44
+	.byte	0x24
+	.byte	0x01
+	movq	%rbp, %rsp
 	cfi_def_cfa_register (%rsp)
-        popq      %rbp
+	popq	%rbp
 	cfi_adjust_cfa_offset (-8)
 	cfi_restore (%rbp)
 	ret
@@ -274,29 +285,41 @@
         movq      %rsp, %rbp
 	cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x04
-        .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x4c
-        .byte 0x24
-        vmovaps (%rsp), %ymm0
-        vmovaps 64(%rsp), %ymm1
+	subq	$192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x11
+	.byte	0x04
+	.byte	0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x11
+	.byte	0x4c
+	.byte	0x24
+	.byte	0x01
+	vmovups	(%rsp), %ymm0
+	vmovups	64(%rsp), %ymm1
         call HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        vmovaps 96(%rsp), %ymm1
+	vmovups	%ymm0, 128(%rsp)
+	vmovups	32(%rsp), %ymm0
+	vmovups	96(%rsp), %ymm1
         call HIDDEN_JUMPTARGET(\callee)
+	vmovups	%ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+	.byte	0x62
+	.byte	0xf1
+	.byte	0x7c
+	.byte	0x48
+	.byte	0x10
+	.byte	0x44
+	.byte	0x24
+	.byte	0x02
         movq      %rbp, %rsp
 	cfi_def_cfa_register (%rsp)
         popq      %rbp
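For reference, below is a minimal sketch of the GNU as numeric local-label convention that the IFUNC selector fixes above rely on; the pick routine and its return values are hypothetical and only illustrate the syntax. A numeric label is defined as "1:", a forward reference to it must be spelled "1f" (a backward reference "1b"), and a bare operand such as "jne 1" is not a reference to that label, which is why the selectors now use "jne 1f" and "jnz 2f".

        .text
        .globl  pick
        .type   pick, @function
/* Hypothetical example: return 100 when %edi is zero, 200 otherwise.  */
pick:
        cmpl    $0, %edi
        jne     1f              /* Forward reference to the numeric label below.  */
        movl    $100, %eax
        ret
1:      movl    $200, %eax      /* Nearest following definition of label 1.  */
        ret
        .size   pick, .-pick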