AVX512FP16: Add reduce operators (add/mul/min/max).

gcc/ChangeLog:

	* config/i386/avx512fp16intrin.h (_MM512_REDUCE_OP): New macro.
	(_mm512_reduce_add_ph): New intrinsic.
	(_mm512_reduce_mul_ph): Ditto.
	(_mm512_reduce_min_ph): Ditto.
	(_mm512_reduce_max_ph): Ditto.
	* config/i386/avx512fp16vlintrin.h
	(_MM256_REDUCE_OP, _MM_REDUCE_OP): New macros.
	(_mm256_reduce_add_ph): New intrinsic.
	(_mm256_reduce_mul_ph): Ditto.
	(_mm256_reduce_min_ph): Ditto.
	(_mm256_reduce_max_ph): Ditto.
	(_mm_reduce_add_ph): Ditto.
	(_mm_reduce_mul_ph): Ditto.
	(_mm_reduce_min_ph): Ditto.
	(_mm_reduce_max_ph): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-reduce-op-1.c: New test.
	* gcc.target/i386/avx512fp16vl-reduce-op-1.c: Ditto.
Author: dianhong xu
Date: 2021-06-21 16:11:23 +08:00
Committer: liuhongt
Commit: f6afc926dc (parent 6185b9a93c)
4 changed files, 579 insertions(+), 0 deletions(-)
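For reference, a minimal usage sketch of the new 512-bit reductions (illustrative only, not part of the patch; assumes a compiler with -mavx512fp16 and matching hardware):

#include <immintrin.h>

/* Horizontal sum and maximum over the 32 _Float16 lanes of a zmm
   value; the function names are illustrative.  */
static _Float16
sum32 (__m512h v)
{
  return _mm512_reduce_add_ph (v);
}

static _Float16
max32 (__m512h v)
{
  return _mm512_reduce_max_ph (v);
}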


gcc/config/i386/avx512fp16intrin.h
@@ -7086,6 +7086,104 @@ _mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E)
#endif /* __OPTIMIZE__ */
#define _MM512_REDUCE_OP(op) \
__m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
__m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
__m256h __T3 = (__T1 op __T2); \
__m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \
__m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \
__m128h __T6 = (__T4 op __T5); \
__m128h __T7 = (__m128h) __builtin_shuffle ((__m128h) __T6, \
(__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
__m128h __T8 = (__T6 op __T7); \
__m128h __T9 = (__m128h) __builtin_shuffle ((__m128h) __T8, \
(__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
__m128h __T10 = __T8 op __T9; \
return __T10[0] op __T10[1]
/* TODO reduce.  */
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_add_ph (__m512h __A)
{
_MM512_REDUCE_OP (+);
}
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_mul_ph (__m512h __A)
{
_MM512_REDUCE_OP (*);
}
#undef _MM512_REDUCE_OP
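/* Each step of _MM512_REDUCE_OP above halves the live lanes
   (32 -> 16 -> 8 -> 4 -> 2) and the final scalar 'op' combines the
   last pair, so a full reduction is four vector ops plus one scalar
   op.  */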
#ifdef __AVX512VL__
#define _MM512_REDUCE_OP(op) \
__m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
__m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
__m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2, \
_mm256_setzero_ph (), (__mmask16) -1); \
__m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \
__m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \
__m128h __T6 = __builtin_ia32_##op##ph128_mask \
(__T4, __T5, _mm_setzero_ph (), (__mmask8) -1); \
__m128h __T7 = (__m128h) __builtin_shuffle ((__m128h) __T6, \
(__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
__m128h __T8 = (__m128h) __builtin_ia32_##op##ph128_mask \
(__T6, __T7, _mm_setzero_ph (), (__mmask8) -1); \
__m128h __T9 = (__m128h) __builtin_shuffle ((__m128h) __T8, \
(__v8hi) { 4, 5 }); \
__m128h __T10 = __builtin_ia32_##op##ph128_mask \
(__T8, __T9, _mm_setzero_ph (), (__mmask8) -1); \
__m128h __T11 = (__m128h) __builtin_shuffle (__T10, \
(__v8hi) { 1, 0 }); \
__m128h __T12 = __builtin_ia32_##op##ph128_mask \
(__T10, __T11, _mm_setzero_ph (), (__mmask8) -1); \
return __T12[0]
#else
#define _MM512_REDUCE_OP(op) \
__m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A, \
(__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 }); \
__m512h __T2 = _mm512_##op##_ph (__A, __T1); \
__m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2, \
(__v8di) { 2, 3, 0, 0, 0, 0, 0, 0 }); \
__m512h __T4 = _mm512_##op##_ph (__T2, __T3); \
__m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4, \
(__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 }); \
__m512h __T6 = _mm512_##op##_ph (__T4, __T5); \
__m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6, \
(__v16si) { 1, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0 }); \
__m512h __T8 = _mm512_##op##_ph (__T6, __T7); \
__m512h __T9 = (__m512h) __builtin_shuffle (__T8, \
(__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0 }); \
__m512h __T10 = _mm512_##op##_ph (__T8, __T9); \
return __T10[0]
#endif
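/* With AVX512VL, min/max reduce via the masked vminph/vmaxph builtins
   on progressively narrower vectors; the fallback keeps the full
   512-bit vector and shuffles 64-, 32- and 16-bit granules instead.  */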
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_min_ph (__m512h __A)
{
_MM512_REDUCE_OP (min);
}
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_reduce_max_ph (__m512h __A)
{
_MM512_REDUCE_OP (max);
}
#undef _MM512_REDUCE_OP
#ifdef __DISABLE_AVX512FP16__
#undef __DISABLE_AVX512FP16__
#pragma GCC pop_options
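
For add and mul, the generic _MM512_REDUCE_OP folds the lanes in a fixed pairwise order. A scalar model of that association order, where OP stands for the + or * the macro is instantiated with (a sketch for illustration, not part of the patch):

/* Scalar model of the pairwise tree used by _MM512_REDUCE_OP.  */
#define OP +	/* or * */
static _Float16
reduce_model (const _Float16 a[32])
{
  _Float16 t[32];
  for (int i = 0; i < 32; i++)
    t[i] = a[i];
  for (int w = 16; w >= 1; w /= 2)	/* live lanes: 32 -> ... -> 1 */
    for (int i = 0; i < w; i++)
      t[i] = t[i] OP t[i + w];
  return t[0];
}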


gcc/config/i386/avx512fp16vlintrin.h
@@ -3095,6 +3095,111 @@ _mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
__A);
}
#define _MM256_REDUCE_OP(op) \
__m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
__m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
__m128h __T3 = (__T1 op __T2); \
__m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
(__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
__m128h __T5 = (__T3) op (__T4); \
__m128h __T6 = (__m128h) __builtin_shuffle (__T5, \
(__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
__m128h __T7 = __T5 op __T6; \
return __T7[0] op __T7[1]
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_add_ph (__m256h __A)
{
_MM256_REDUCE_OP (+);
}
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_mul_ph (__m256h __A)
{
_MM256_REDUCE_OP (*);
}
#undef _MM256_REDUCE_OP
#define _MM256_REDUCE_OP(op) \
__m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
__m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
__m128h __T3 = _mm_##op (__T1, __T2); \
__m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
(__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
__m128h __T5 = _mm_##op (__T3, __T4); \
__m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \
__m128h __T7 = _mm_##op (__T5, __T6); \
__m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \
__m128h __T9 = _mm_##op (__T7, __T8); \
return __T9[0]
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_min_ph (__m256h __A)
{
_MM256_REDUCE_OP (min_ph);
}
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_reduce_max_ph (__m256h __A)
{
_MM256_REDUCE_OP (max_ph);
}
#define _MM_REDUCE_OP(op) \
__m128h __T1 = (__m128h) __builtin_shuffle (__A, \
(__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
__m128h __T2 = (__A) op (__T1); \
__m128h __T3 = (__m128h) __builtin_shuffle (__T2, \
(__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
__m128h __T4 = __T2 op __T3; \
return __T4[0] op __T4[1]
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_add_ph (__m128h __A)
{
_MM_REDUCE_OP (+);
}
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_mul_ph (__m128h __A)
{
_MM_REDUCE_OP (*);
}
#undef _MM_REDUCE_OP
#define _MM_REDUCE_OP(op) \
__m128h __T1 = (__m128h) __builtin_shuffle (__A, \
(__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
__m128h __T2 = _mm_##op (__A, __T1); \
__m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi) { 4, 5 }); \
__m128h __T4 = _mm_##op (__T2, __T3); \
__m128h __T5 = (__m128h) __builtin_shuffle (__T4, (__v8hi) { 1, 0 }); \
__m128h __T6 = _mm_##op (__T4, __T5); \
return __T6[0]
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_min_ph (__m128h __A)
{
_MM_REDUCE_OP (min_ph);
}
extern __inline _Float16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_reduce_max_ph (__m128h __A)
{
_MM_REDUCE_OP (max_ph);
}
#undef _MM256_REDUCE_OP
#undef _MM_REDUCE_OP
#ifdef __DISABLE_AVX512FP16VL__
#undef __DISABLE_AVX512FP16VL__
#pragma GCC pop_options
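
The VL-sized reductions compose with the existing element-wise intrinsics. A minimal usage sketch (the name dot8 is illustrative; assumes -mavx512fp16 -mavx512vl):

#include <immintrin.h>

/* 8-lane _Float16 dot product via the new 128-bit reduction;
   a sketch, not part of the patch.  */
static _Float16
dot8 (__m128h x, __m128h y)
{
  return _mm_reduce_add_ph (_mm_mul_ph (x, y));
}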


gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-1.c
@@ -0,0 +1,132 @@
/* { dg-do run { target avx512fp16 } } */
/* { dg-options "-O2 -mavx512fp16" } */
static void do_test (void);
#define DO_TEST do_test
#define AVX512FP16
#include <immintrin.h>
#include "avx512-check.h"
__m512h a1 = { -39.3f16, -180.9f16, 13.4f16, 35.4f16, -41.1f16, -14.4f16, 24.5f16, 53.54f16,
238.4f16, -134.8f16, 24.5f16, 35.6f16, -346.7f16, -43.4f16, -535.3f16, 324.7f16,
82.5f16, 21.4f16, 24.4f16, 53.4f16, 23.5f16, -24.4f16, -34.5f16, -32.5f16,
23.6f16, -13.4f16, 24.5f16, 35.5f16, -34.4f16, -24.5f16, -34.5f16, 13.5f16 };
__m512h a2 = { 1.25f16, 2.25f16, -0.25f16, 4.0f16, -2.0f16, 4.0f16, -3.0f16, 2.0f16,
-0.5f16, -1.0f16, 1.0f16, -1.0f16, 1.0f16, 1.0f16, 2.0f16, 4.0f16,
1.25f16, 2.25f16, -4.25f16, 4.0f16, -2.4f16, 4.0f16, -3.0f16, 2.0f16,
-4.5f16, 7.6f16, 0.7f16, -8.2f16, 2.1f16, 2.4f16, -2.0f16, 19.4f16 };
__attribute__((noinline, noclone)) _Float16
test_reduce_add_ph (__m512h a)
{
return _mm512_reduce_add_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_mul_ph (__m512h a)
{
return _mm512_reduce_mul_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_max_ph (__m512h a)
{
return _mm512_reduce_max_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_min_ph (__m512h a)
{
return _mm512_reduce_min_ph (a);
}
#define SIZE 32
#define REF_ADDMUL(op, a) \
__m256h __a1 = _mm256_setzero_ph (); \
for (int i = 0; i < 16; i++) { \
__a1[i] = (_Float16) a[i] op (_Float16) a[i + 16]; \
} \
__m128h __a2 = _mm_setzero_ph (); \
for (int i = 0; i < 8; i++) { \
__a2[i] = (_Float16) __a1[i] op (_Float16) __a1[i + 8]; \
} \
_Float16 __c0 = __a2[0] op __a2[4]; \
_Float16 __c1 = __a2[1] op __a2[5]; \
_Float16 __c2 = __a2[2] op __a2[6]; \
_Float16 __c3 = __a2[3] op __a2[7]; \
_Float16 __d0 = __c0 op __c2; \
_Float16 __d1 = __c1 op __c3; \
_Float16 __e0 = __d0 op __d1; \
r3 = __e0
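/* Note: REF_ADDMUL applies 'op' in the same pairwise order as the
   intrinsic (16+16, 8+8, 4+4, 2+2, then the last pair), so the
   half-precision rounding matches and the exact comparisons in TESTOP
   below are valid.  */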
#define TESTOP(opname, op, a) \
do { \
_Float16 r1 = _mm512_reduce_##opname##_ph (a); \
_Float16 r2 = test_reduce_##opname##_ph (a); \
_Float16 r3 = a[0]; \
if (r1 != r2) { \
__builtin_abort (); \
} \
REF_ADDMUL (op, a); \
if (r1 != r3) { \
__builtin_abort (); \
} \
} while (0)
#define TEST_ADDMUL_PH(a) \
do { \
TESTOP (add, +, a); \
TESTOP (mul, *, a); \
} while (0)
static void
test_512_addmul_ph (void)
{
TEST_ADDMUL_PH (a1);
TEST_ADDMUL_PH (a2);
}
#undef TESTOP
#define TESTOP(opname, op, a) \
do { \
_Float16 r1 = _mm512_reduce_##opname##_ph (a); \
_Float16 r2 = test_reduce_##opname##_ph (a); \
_Float16 r3 = a[0]; \
if (r1 != r2) { \
__builtin_abort (); \
} \
for (int i = 1; i < SIZE; i++) \
r3 = r3 op a[i]; \
if (r1 != r3) { \
__builtin_abort (); \
} \
} while (0)
#define TEST_MINMAX_PH(a) \
do { \
TESTOP (min, < a[i] ? r3 :, a); \
TESTOP (max, > a[i] ? r3 :, a); \
} while (0)
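/* The 'op' argument above makes TESTOP's update expand to
   r3 = r3 < a[i] ? r3 : a[i];  (and with > for max), i.e. a running
   minimum/maximum; a linear scan is a valid reference because min/max
   do not depend on association order for these inputs.  */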
static void
test_512_minmax_ph (void)
{
TEST_MINMAX_PH (a1);
TEST_MINMAX_PH (a2);
}
static void
do_test (void)
{
test_512_addmul_ph ();
test_512_minmax_ph ();
}
#undef SIZE
#undef REF_ADDMUL
#undef TESTOP
#undef TEST_ADDMUL_PH
#undef TEST_MINMAX_PH


gcc/testsuite/gcc.target/i386/avx512fp16vl-reduce-op-1.c
@@ -0,0 +1,244 @@
/* { dg-do run { target avx512fp16 } } */
/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
static void do_test (void);
#define DO_TEST do_test
#define AVX512FP16
#include <immintrin.h>
#include "avx512-check.h"
__m256h a1 = { -39.3f16, -180.9f16, 13.4f16, 35.4f16, -41.1f16, -14.4f16, 24.5f16, 53.54f16,
238.4f16, -134.8f16, 24.5f16, 35.6f16, -346.7f16, -43.4f16, -535.3f16, 324.7f16 };
__m256h a2 = { 82.5f16, 21.4f16, 24.4f16, 53.4f16, 23.5f16, -24.4f16, -34.5f16, -32.5f16,
23.6f16, -13.4f16, 24.5f16, 35.5f16, -34.4f16, -24.5f16, -34.5f16, 13.5f16 };
__m128h b1 = { 1.25f16, 2.25f16, -0.25f16, 4.0f16, -2.0f16, 4.0f16, -3.0f16, 2.0f16 };
__m128h b2 = { -0.5f16, -1.0f16, 1.0f16, -1.0f16, 1.0f16, 1.0f16, 2.0f16, 4.0f16 };
__m128h b3 = { 1.25f16, 2.25f16, -4.25f16, 4.0f16, -2.4f16, 4.0f16, -3.0f16, 2.0f16 };
__m128h b4 = { -4.5f16, 7.6f16, 0.7f16, -8.2f16, 2.1f16, 2.4f16, -2.0f16, 1.4f16 };
__attribute__((noinline, noclone)) _Float16
test_reduce_256_add_ph (__m256h a)
{
return _mm256_reduce_add_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_256_mul_ph (__m256h a)
{
return _mm256_reduce_mul_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_256_max_ph (__m256h a)
{
return _mm256_reduce_max_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_256_min_ph (__m256h a)
{
return _mm256_reduce_min_ph (a);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_add_ph (__m128h b)
{
return _mm_reduce_add_ph (b);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_mul_ph (__m128h b)
{
return _mm_reduce_mul_ph (b);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_max_ph (__m128h b)
{
return _mm_reduce_max_ph (b);
}
__attribute__((noinline, noclone)) _Float16
test_reduce_min_ph (__m128h b)
{
return _mm_reduce_min_ph (b);
}
#define SIZE 16
#define REF_ADDMUL(op, a) \
__m128h __a1 = _mm_setzero_ph (); \
for (int i = 0; i < 8; i++) { \
__a1[i] = (_Float16) a[i] op (_Float16) a[i + 8]; \
} \
_Float16 __c0 = __a1[0] op __a1[4]; \
_Float16 __c1 = __a1[1] op __a1[5]; \
_Float16 __c2 = __a1[2] op __a1[6]; \
_Float16 __c3 = __a1[3] op __a1[7]; \
_Float16 __d0 = __c0 op __c2; \
_Float16 __d1 = __c1 op __c3; \
_Float16 __e0 = __d0 op __d1; \
r3 = __e0
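/* As in the 512-bit test, this reference mirrors the intrinsic's
   pairwise order (8+8, 4+4, 2+2, final pair), keeping the
   half-precision rounding identical.  */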
#define TESTOP(opname, op, a) \
do { \
_Float16 r1 = _mm256_reduce_##opname##_ph (a); \
_Float16 r2 = test_reduce_256_##opname##_ph (a); \
_Float16 r3 = a[0]; \
if (r1 != r2) { \
__builtin_abort (); \
} \
REF_ADDMUL (op, a); \
if (r1 != r3) { \
__builtin_abort (); \
} \
} while (0)
#define TEST_ADDMUL_PH(a) \
do { \
TESTOP (add, +, a); \
TESTOP (mul, *, a); \
} while (0)
static void
test_256_addmul_ph (void)
{
TEST_ADDMUL_PH (a1);
TEST_ADDMUL_PH (a2);
}
#undef TESTOP
#define TESTOP(opname, op, a) \
do { \
_Float16 r1 = _mm256_reduce_##opname##_ph (a); \
_Float16 r2 = test_reduce_256_##opname##_ph (a); \
_Float16 r3 = a[0]; \
if (r1 != r2) { \
__builtin_abort (); \
} \
for (int i = 1; i < SIZE; i++) \
r3 = r3 op a[i]; \
if (r1 != r3) { \
__builtin_abort (); \
} \
} while (0)
#define TEST_MINMAX_PH(a) \
do { \
TESTOP (min, < a[i] ? r3 :, a); \
TESTOP (max, > a[i] ? r3 :, a); \
} while (0)
static void
test_256_minmax_ph (void)
{
TEST_MINMAX_PH (a1);
TEST_MINMAX_PH (a2);
}
static void
test_256_ph (void)
{
test_256_addmul_ph ();
test_256_minmax_ph ();
}
#undef SIZE
#define SIZE 8
#undef REF_ADDMUL
#define REF_ADDMUL(op, a) \
_Float16 __c0 = a[0] op a[4]; \
_Float16 __c1 = a[1] op a[5]; \
_Float16 __c2 = a[2] op a[6]; \
_Float16 __c3 = a[3] op a[7]; \
_Float16 __d0 = __c0 op __c2; \
_Float16 __d1 = __c1 op __c3; \
_Float16 __e0 = __d0 op __d1; \
r3 = __e0
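/* 128-bit reference: same pairwise tree as the _mm_reduce_* forms
   (4+4, 2+2, final pair), so exact comparison remains valid.  */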
#undef TESTOP
#define TESTOP(opname, op, a) \
do { \
_Float16 r1 = _mm_reduce_##opname##_ph (a); \
_Float16 r2 = test_reduce_##opname##_ph (a); \
_Float16 r3 = a[0]; \
if (r1 != r2) { \
__builtin_abort (); \
} \
REF_ADDMUL (op, a); \
if (r1 != r3) { \
__builtin_abort (); \
} \
} while (0)
#undef TEST_ADDMUL_PH
#define TEST_ADDMUL_PH(a) \
do { \
TESTOP (add, +, a); \
TESTOP (mul, *, a); \
} while (0)
static void
test_128_addmul_ph (void)
{
TEST_ADDMUL_PH (b1);
TEST_ADDMUL_PH (b2);
TEST_ADDMUL_PH (b3);
TEST_ADDMUL_PH (b4);
}
#undef TESTOP
#define TESTOP(opname, op, b) \
do { \
_Float16 r1 = _mm_reduce_##opname##_ph (b); \
_Float16 r2 = test_reduce_##opname##_ph (b); \
_Float16 r3 = b[0]; \
if (r1 != r2) { \
__builtin_abort (); \
} \
for (int i = 1; i < SIZE; i++) \
r3 = r3 op b[i]; \
if (r1 != r3) { \
__builtin_abort (); \
} \
} while (0)
#undef TEST_MINMAX_PH
#define TEST_MINMAX_PH(b) \
do { \
TESTOP (min, < b[i] ? r3 :, b); \
TESTOP (max, > b[i] ? r3 :, b); \
} while (0)
static void
test_128_minmax_ph (void)
{
TEST_MINMAX_PH (b1);
TEST_MINMAX_PH (b2);
TEST_MINMAX_PH (b3);
TEST_MINMAX_PH (b4);
}
static void
test_128_ph (void)
{
test_128_addmul_ph ();
test_128_minmax_ph ();
}
static void
do_test (void)
{
test_256_ph ();
test_128_ph ();
}
#undef SIZE
#undef REF_ADDMUL
#undef TESTOP
#undef TEST_ADDMUL_PH
#undef TEST_MINMAX_PH