Fixed several libmvec bugs found during testing on KNL hardware.
Fixed the AVX512 IFUNC implementations, the AVX512 wrappers to the AVX2 versions, and the KNL expf implementation.

* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL implementation.
This commit is contained in:
parent
3bcea719dd
commit
9901716135
19
ChangeLog
19
ChangeLog
|
@ -1,3 +1,22 @@
|
||||||
|
2015-07-24 Andrew Senkevich <andrew.senkevich@intel.com>
|
||||||
|
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
|
||||||
|
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
|
||||||
|
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
|
||||||
|
implementation.
|
||||||
|
|
||||||
2015-07-24 Szabolcs Nagy <szabolcs.nagy@arm.com>
|
2015-07-24 Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||||
|
|
||||||
[BZ #17711]
|
[BZ #17711]
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN8v_cos)
|
ENTRY (_ZGVeN8v_cos)
|
||||||
.type _ZGVeN8v_cos, @gnu_indirect_function
|
.type _ZGVeN8v_cos, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN8v_cos_skx(%rip), %rax
|
1: leaq _ZGVeN8v_cos_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN8v_cos_knl(%rip), %rax
|
leaq _ZGVeN8v_cos_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN8v_cos)
|
END (_ZGVeN8v_cos)
|
||||||
|
|
||||||
#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
|
#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN8v_exp)
|
ENTRY (_ZGVeN8v_exp)
|
||||||
.type _ZGVeN8v_exp, @gnu_indirect_function
|
.type _ZGVeN8v_exp, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN8v_exp_skx(%rip), %rax
|
1: leaq _ZGVeN8v_exp_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN8v_exp_knl(%rip), %rax
|
leaq _ZGVeN8v_exp_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN8v_exp)
|
END (_ZGVeN8v_exp)
|
||||||
|
|
||||||
#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
|
#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN8v_log)
|
ENTRY (_ZGVeN8v_log)
|
||||||
.type _ZGVeN8v_log, @gnu_indirect_function
|
.type _ZGVeN8v_log, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN8v_log_skx(%rip), %rax
|
1: leaq _ZGVeN8v_log_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN8v_log_knl(%rip), %rax
|
leaq _ZGVeN8v_log_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN8v_log)
|
END (_ZGVeN8v_log)
|
||||||
|
|
||||||
#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
|
#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN8vv_pow)
|
ENTRY (_ZGVeN8vv_pow)
|
||||||
.type _ZGVeN8vv_pow, @gnu_indirect_function
|
.type _ZGVeN8vv_pow, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN8vv_pow_skx(%rip), %rax
|
1: leaq _ZGVeN8vv_pow_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN8vv_pow_knl(%rip), %rax
|
leaq _ZGVeN8vv_pow_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN8vv_pow)
|
END (_ZGVeN8vv_pow)
|
||||||
|
|
||||||
#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
|
#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN8v_sin)
|
ENTRY (_ZGVeN8v_sin)
|
||||||
.type _ZGVeN8v_sin, @gnu_indirect_function
|
.type _ZGVeN8v_sin, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN8v_sin_skx(%rip), %rax
|
1: leaq _ZGVeN8v_sin_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN8v_sin_knl(%rip), %rax
|
leaq _ZGVeN8v_sin_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN8v_sin)
|
END (_ZGVeN8v_sin)
|
||||||
|
|
||||||
#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
|
#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN8vvv_sincos)
|
ENTRY (_ZGVeN8vvv_sincos)
|
||||||
.type _ZGVeN8vvv_sincos, @gnu_indirect_function
|
.type _ZGVeN8vvv_sincos, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax
|
1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
|
leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN8vvv_sincos)
|
END (_ZGVeN8vvv_sincos)
|
||||||
|
|
||||||
#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
|
#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN16v_cosf)
|
ENTRY (_ZGVeN16v_cosf)
|
||||||
.type _ZGVeN16v_cosf, @gnu_indirect_function
|
.type _ZGVeN16v_cosf, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
|
1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN16v_cosf_knl(%rip), %rax
|
leaq _ZGVeN16v_cosf_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN16v_cosf)
|
END (_ZGVeN16v_cosf)
|
||||||
|
|
||||||
#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
|
#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN16v_expf)
|
ENTRY (_ZGVeN16v_expf)
|
||||||
.type _ZGVeN16v_expf, @gnu_indirect_function
|
.type _ZGVeN16v_expf, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN16v_expf_skx(%rip), %rax
|
1: leaq _ZGVeN16v_expf_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN16v_expf_knl(%rip), %rax
|
leaq _ZGVeN16v_expf_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN16v_expf)
|
END (_ZGVeN16v_expf)
|
||||||
|
|
||||||
#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
|
#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
|
||||||
|
|
|
@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||||
The table lookup is skipped if k = 0.
|
The table lookup is skipped if k = 0.
|
||||||
For low accuracy approximation, exp(r) ~ 1 or 1+r. */
|
For low accuracy approximation, exp(r) ~ 1 or 1+r. */
|
||||||
|
|
||||||
|
pushq %rbp
|
||||||
cfi_adjust_cfa_offset (8)
|
cfi_adjust_cfa_offset (8)
|
||||||
cfi_rel_offset (%rbp, 0)
|
cfi_rel_offset (%rbp, 0)
|
||||||
movq %rsp, %rbp
|
movq %rsp, %rbp
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN16v_logf)
|
ENTRY (_ZGVeN16v_logf)
|
||||||
.type _ZGVeN16v_logf, @gnu_indirect_function
|
.type _ZGVeN16v_logf, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN16v_logf_skx(%rip), %rax
|
1: leaq _ZGVeN16v_logf_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN16v_logf_knl(%rip), %rax
|
leaq _ZGVeN16v_logf_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN16v_logf)
|
END (_ZGVeN16v_logf)
|
||||||
|
|
||||||
#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
|
#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN16vv_powf)
|
ENTRY (_ZGVeN16vv_powf)
|
||||||
.type _ZGVeN16vv_powf, @gnu_indirect_function
|
.type _ZGVeN16vv_powf, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN16vv_powf_skx(%rip), %rax
|
1: leaq _ZGVeN16vv_powf_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN16vv_powf_knl(%rip), %rax
|
leaq _ZGVeN16vv_powf_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN16vv_powf)
|
END (_ZGVeN16vv_powf)
|
||||||
|
|
||||||
#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
|
#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN16vvv_sincosf)
|
ENTRY (_ZGVeN16vvv_sincosf)
|
||||||
.type _ZGVeN16vvv_sincosf, @gnu_indirect_function
|
.type _ZGVeN16vvv_sincosf, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax
|
1: leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
|
leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN16vvv_sincosf)
|
END (_ZGVeN16vvv_sincosf)
|
||||||
|
|
||||||
#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
|
#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
|
||||||
|
|
|
@ -23,16 +23,16 @@
|
||||||
ENTRY (_ZGVeN16v_sinf)
|
ENTRY (_ZGVeN16v_sinf)
|
||||||
.type _ZGVeN16v_sinf, @gnu_indirect_function
|
.type _ZGVeN16v_sinf, @gnu_indirect_function
|
||||||
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
|
||||||
jne 1
|
jne 1f
|
||||||
call __init_cpu_features
|
call __init_cpu_features
|
||||||
1: leaq _ZGVeN16v_sinf_skx(%rip), %rax
|
1: leaq _ZGVeN16v_sinf_skx(%rip), %rax
|
||||||
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
2: leaq _ZGVeN16v_sinf_knl(%rip), %rax
|
leaq _ZGVeN16v_sinf_knl(%rip), %rax
|
||||||
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
|
||||||
jnz 3
|
jnz 2f
|
||||||
leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
|
leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
|
||||||
3: ret
|
2: ret
|
||||||
END (_ZGVeN16v_sinf)
|
END (_ZGVeN16v_sinf)
|
||||||
|
|
||||||
#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
|
#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
|
||||||
|
|
|
@ -194,39 +194,39 @@
|
||||||
|
|
||||||
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
|
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
|
||||||
.macro WRAPPER_IMPL_AVX512 callee
|
.macro WRAPPER_IMPL_AVX512 callee
|
||||||
pushq %rbp
|
pushq %rbp
|
||||||
cfi_adjust_cfa_offset (8)
|
cfi_adjust_cfa_offset (8)
|
||||||
cfi_rel_offset (%rbp, 0)
|
cfi_rel_offset (%rbp, 0)
|
||||||
movq %rsp, %rbp
|
movq %rsp, %rbp
|
||||||
cfi_def_cfa_register (%rbp)
|
cfi_def_cfa_register (%rbp)
|
||||||
andq $-64, %rsp
|
andq $-64, %rsp
|
||||||
subq $64, %rsp
|
subq $128, %rsp
|
||||||
/* Below is encoding for vmovaps %zmm0, (%rsp). */
|
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x04
|
.byte 0x04
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
/* Below is encoding for vmovapd (%rsp), %ymm0. */
|
vmovupd (%rsp), %ymm0
|
||||||
.byte 0xc5
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
.byte 0xfd
|
vmovupd %ymm0, 64(%rsp)
|
||||||
.byte 0x28
|
vmovupd 32(%rsp), %ymm0
|
||||||
.byte 0x04
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
.byte 0x24
|
vmovupd %ymm0, 96(%rsp)
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
/* Below is encoding for vmovups 64(%rsp), %zmm0. */
|
||||||
/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
|
.byte 0x62
|
||||||
.byte 0xc5
|
.byte 0xf1
|
||||||
.byte 0xfd
|
.byte 0x7c
|
||||||
.byte 0x28
|
.byte 0x48
|
||||||
.byte 0x44
|
.byte 0x10
|
||||||
.byte 0x24
|
.byte 0x44
|
||||||
.byte 0x20
|
.byte 0x24
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
.byte 0x01
|
||||||
movq %rbp, %rsp
|
movq %rbp, %rsp
|
||||||
cfi_def_cfa_register (%rsp)
|
cfi_def_cfa_register (%rsp)
|
||||||
popq %rbp
|
popq %rbp
|
||||||
cfi_adjust_cfa_offset (-8)
|
cfi_adjust_cfa_offset (-8)
|
||||||
cfi_restore (%rbp)
|
cfi_restore (%rbp)
|
||||||
ret
|
ret
|
||||||
|
@ -234,61 +234,50 @@
|
||||||
|
|
||||||
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
|
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
|
||||||
.macro WRAPPER_IMPL_AVX512_ff callee
|
.macro WRAPPER_IMPL_AVX512_ff callee
|
||||||
pushq %rbp
|
pushq %rbp
|
||||||
cfi_adjust_cfa_offset (8)
|
cfi_adjust_cfa_offset (8)
|
||||||
cfi_rel_offset (%rbp, 0)
|
cfi_rel_offset (%rbp, 0)
|
||||||
movq %rsp, %rbp
|
movq %rsp, %rbp
|
||||||
cfi_def_cfa_register (%rbp)
|
cfi_def_cfa_register (%rbp)
|
||||||
andq $-64, %rsp
|
andq $-64, %rsp
|
||||||
subq $128, %rsp
|
subq $192, %rsp
|
||||||
/* Below is encoding for vmovaps %zmm0, (%rsp). */
|
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x04
|
.byte 0x04
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
|
/* Below is encoding for vmovups %zmm1, 64(%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x4c
|
.byte 0x4c
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
/* Below is encoding for vmovapd (%rsp), %ymm0. */
|
.byte 0x01
|
||||||
.byte 0xc5
|
vmovupd (%rsp), %ymm0
|
||||||
.byte 0xfd
|
vmovupd 64(%rsp), %ymm1
|
||||||
.byte 0x28
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
.byte 0x04
|
vmovupd %ymm0, 128(%rsp)
|
||||||
.byte 0x24
|
vmovupd 32(%rsp), %ymm0
|
||||||
/* Below is encoding for vmovapd 64(%rsp), %ymm1. */
|
vmovupd 96(%rsp), %ymm1
|
||||||
.byte 0xc5
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
.byte 0xfd
|
vmovupd %ymm0, 160(%rsp)
|
||||||
.byte 0x28
|
/* Below is encoding for vmovups 128(%rsp), %zmm0. */
|
||||||
.byte 0x4c
|
.byte 0x62
|
||||||
.byte 0x24
|
.byte 0xf1
|
||||||
.byte 0x40
|
.byte 0x7c
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
.byte 0x48
|
||||||
/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
|
.byte 0x10
|
||||||
.byte 0xc5
|
.byte 0x44
|
||||||
.byte 0xfd
|
.byte 0x24
|
||||||
.byte 0x28
|
.byte 0x02
|
||||||
.byte 0x44
|
movq %rbp, %rsp
|
||||||
.byte 0x24
|
|
||||||
.byte 0x20
|
|
||||||
/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
|
|
||||||
.byte 0xc5
|
|
||||||
.byte 0xfd
|
|
||||||
.byte 0x28
|
|
||||||
.byte 0x4c
|
|
||||||
.byte 0x24
|
|
||||||
.byte 0x60
|
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
|
||||||
movq %rbp, %rsp
|
|
||||||
cfi_def_cfa_register (%rsp)
|
cfi_def_cfa_register (%rsp)
|
||||||
popq %rbp
|
popq %rbp
|
||||||
cfi_adjust_cfa_offset (-8)
|
cfi_adjust_cfa_offset (-8)
|
||||||
cfi_restore (%rbp)
|
cfi_restore (%rbp)
|
||||||
ret
|
ret
|
||||||
|
@ -310,61 +299,26 @@
|
||||||
cfi_rel_offset (%r13, 0)
|
cfi_rel_offset (%r13, 0)
|
||||||
subq $176, %rsp
|
subq $176, %rsp
|
||||||
movq %rsi, %r13
|
movq %rsi, %r13
|
||||||
/* Below is encoding for vmovaps %zmm0, (%rsp). */
|
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x04
|
.byte 0x04
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
movq %rdi, %r12
|
movq %rdi, %r12
|
||||||
/* Below is encoding for vmovapd (%rsp), %ymm0. */
|
vmovupd (%rsp), %ymm0
|
||||||
.byte 0xc5
|
|
||||||
.byte 0xfd
|
|
||||||
.byte 0x28
|
|
||||||
.byte 0x04
|
|
||||||
.byte 0x24
|
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
|
vmovupd 32(%rsp), %ymm0
|
||||||
.byte 0xc5
|
|
||||||
.byte 0xfd
|
|
||||||
.byte 0x28
|
|
||||||
.byte 0x44
|
|
||||||
.byte 0x24
|
|
||||||
.byte 0x20
|
|
||||||
lea 64(%rsp), %rdi
|
lea 64(%rsp), %rdi
|
||||||
lea 96(%rsp), %rsi
|
lea 96(%rsp), %rsi
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
/* Below is encoding for vmovapd 64(%rsp), %ymm0. */
|
vmovupd 64(%rsp), %ymm0
|
||||||
.byte 0xc5
|
vmovupd 96(%rsp), %ymm1
|
||||||
.byte 0xfd
|
vmovupd %ymm0, 32(%r12)
|
||||||
.byte 0x28
|
vmovupd %ymm1, 32(%r13)
|
||||||
.byte 0x44
|
vzeroupper
|
||||||
.byte 0x24
|
|
||||||
.byte 0x40
|
|
||||||
/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
|
|
||||||
.byte 0xc5
|
|
||||||
.byte 0xfd
|
|
||||||
.byte 0x28
|
|
||||||
.byte 0x4c
|
|
||||||
.byte 0x24
|
|
||||||
.byte 0x60
|
|
||||||
/* Below is encoding for vmovapd %ymm0, 32(%r12). */
|
|
||||||
.byte 0xc4
|
|
||||||
.byte 0xc1
|
|
||||||
.byte 0x7d
|
|
||||||
.byte 0x29
|
|
||||||
.byte 0x44
|
|
||||||
.byte 0x24
|
|
||||||
.byte 0x20
|
|
||||||
/* Below is encoding for vmovapd %ymm1, 32(%r13). */
|
|
||||||
.byte 0xc4
|
|
||||||
.byte 0xc1
|
|
||||||
.byte 0x7d
|
|
||||||
.byte 0x29
|
|
||||||
.byte 0x4d
|
|
||||||
.byte 0x20
|
|
||||||
addq $176, %rsp
|
addq $176, %rsp
|
||||||
popq %r13
|
popq %r13
|
||||||
cfi_adjust_cfa_offset (-8)
|
cfi_adjust_cfa_offset (-8)
|
||||||
|
|
|
@ -239,28 +239,39 @@
|
||||||
|
|
||||||
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
|
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
|
||||||
.macro WRAPPER_IMPL_AVX512 callee
|
.macro WRAPPER_IMPL_AVX512 callee
|
||||||
pushq %rbp
|
pushq %rbp
|
||||||
cfi_adjust_cfa_offset (8)
|
cfi_adjust_cfa_offset (8)
|
||||||
cfi_rel_offset (%rbp, 0)
|
cfi_rel_offset (%rbp, 0)
|
||||||
movq %rsp, %rbp
|
movq %rsp, %rbp
|
||||||
cfi_def_cfa_register (%rbp)
|
cfi_def_cfa_register (%rbp)
|
||||||
andq $-64, %rsp
|
andq $-64, %rsp
|
||||||
subq $64, %rsp
|
subq $128, %rsp
|
||||||
/* Below is encoding for vmovaps %zmm0, (%rsp). */
|
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x04
|
.byte 0x04
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
vmovaps (%rsp), %ymm0
|
vmovupd (%rsp), %ymm0
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
vmovaps 32(%rsp), %ymm0
|
vmovupd %ymm0, 64(%rsp)
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
vmovupd 32(%rsp), %ymm0
|
||||||
movq %rbp, %rsp
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
|
vmovupd %ymm0, 96(%rsp)
|
||||||
|
/* Below is encoding for vmovups 64(%rsp), %zmm0. */
|
||||||
|
.byte 0x62
|
||||||
|
.byte 0xf1
|
||||||
|
.byte 0x7c
|
||||||
|
.byte 0x48
|
||||||
|
.byte 0x10
|
||||||
|
.byte 0x44
|
||||||
|
.byte 0x24
|
||||||
|
.byte 0x01
|
||||||
|
movq %rbp, %rsp
|
||||||
cfi_def_cfa_register (%rsp)
|
cfi_def_cfa_register (%rsp)
|
||||||
popq %rbp
|
popq %rbp
|
||||||
cfi_adjust_cfa_offset (-8)
|
cfi_adjust_cfa_offset (-8)
|
||||||
cfi_restore (%rbp)
|
cfi_restore (%rbp)
|
||||||
ret
|
ret
|
||||||
|
@ -274,29 +285,41 @@
|
||||||
movq %rsp, %rbp
|
movq %rsp, %rbp
|
||||||
cfi_def_cfa_register (%rbp)
|
cfi_def_cfa_register (%rbp)
|
||||||
andq $-64, %rsp
|
andq $-64, %rsp
|
||||||
subq $128, %rsp
|
subq $192, %rsp
|
||||||
/* Below is encoding for vmovaps %zmm0, (%rsp). */
|
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x04
|
.byte 0x04
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
|
/* Below is encoding for vmovups %zmm1, 64(%rsp). */
|
||||||
.byte 0x62
|
.byte 0x62
|
||||||
.byte 0xf1
|
.byte 0xf1
|
||||||
.byte 0x7c
|
.byte 0x7c
|
||||||
.byte 0x48
|
.byte 0x48
|
||||||
.byte 0x29
|
.byte 0x11
|
||||||
.byte 0x4c
|
.byte 0x4c
|
||||||
.byte 0x24
|
.byte 0x24
|
||||||
vmovaps (%rsp), %ymm0
|
.byte 0x01
|
||||||
vmovaps 64(%rsp), %ymm1
|
vmovups (%rsp), %ymm0
|
||||||
|
vmovups 64(%rsp), %ymm1
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
vmovaps 32(%rsp), %ymm0
|
vmovups %ymm0, 128(%rsp)
|
||||||
vmovaps 96(%rsp), %ymm1
|
vmovups 32(%rsp), %ymm0
|
||||||
|
vmovups 96(%rsp), %ymm1
|
||||||
call HIDDEN_JUMPTARGET(\callee)
|
call HIDDEN_JUMPTARGET(\callee)
|
||||||
|
vmovups %ymm0, 160(%rsp)
|
||||||
|
/* Below is encoding for vmovups 128(%rsp), %zmm0. */
|
||||||
|
.byte 0x62
|
||||||
|
.byte 0xf1
|
||||||
|
.byte 0x7c
|
||||||
|
.byte 0x48
|
||||||
|
.byte 0x10
|
||||||
|
.byte 0x44
|
||||||
|
.byte 0x24
|
||||||
|
.byte 0x02
|
||||||
movq %rbp, %rsp
|
movq %rbp, %rsp
|
||||||
cfi_def_cfa_register (%rsp)
|
cfi_def_cfa_register (%rsp)
|
||||||
popq %rbp
|
popq %rbp
|
||||||
|
|
Loading…
Reference in New Issue