x86: Tune Skylake, Cannonlake and Icelake as Haswell
r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations which are enabled by PROCESSOR_HASWELL. As a result, -mtune=skylake generates slower code on Skylake than before. The same also applies to Cannonlake and Icelake tuning. This patch changes -mtune={skylake|cannonlake|icelake} to tune like -mtune=haswell until their tuning is properly adjusted. It also enables -mprefer-vector-width=256 for -mtune=haswell, which has no impact on codegen when AVX512 isn't enabled. Performance impacts on SPEC CPU 2017 rate with 1 copy using -march=native -mfpmath=sse -O2 -m64 are 1. On Broadwell server: 500.perlbench_r -0.56% 502.gcc_r -0.18% 505.mcf_r 0.24% 520.omnetpp_r 0.00% 523.xalancbmk_r -0.32% 525.x264_r -0.17% 531.deepsjeng_r 0.00% 541.leela_r 0.00% 548.exchange2_r 0.12% 557.xz_r 0.00% Geomean 0.00% 503.bwaves_r 0.00% 507.cactuBSSN_r 0.21% 508.namd_r 0.00% 510.parest_r 0.19% 511.povray_r -0.48% 519.lbm_r 0.00% 521.wrf_r 0.28% 526.blender_r 0.19% 527.cam4_r 0.39% 538.imagick_r 0.00% 544.nab_r -0.36% 549.fotonik3d_r 0.51% 554.roms_r 0.00% Geomean 0.17% On Skylake client: 500.perlbench_r 0.96% 502.gcc_r 0.13% 505.mcf_r -1.03% 520.omnetpp_r -1.11% 523.xalancbmk_r 1.02% 525.x264_r 0.50% 531.deepsjeng_r 2.97% 541.leela_r 0.50% 548.exchange2_r -0.95% 557.xz_r 2.41% Geomean 0.56% 503.bwaves_r 0.49% 507.cactuBSSN_r 3.17% 508.namd_r 4.05% 510.parest_r 0.15% 511.povray_r 0.80% 519.lbm_r 3.15% 521.wrf_r 10.56% 526.blender_r 2.97% 527.cam4_r 2.36% 538.imagick_r 46.40% 544.nab_r 2.04% 549.fotonik3d_r 0.00% 554.roms_r 1.27% Geomean 5.49% On Skylake server: 500.perlbench_r 0.71% 502.gcc_r -0.51% 505.mcf_r -1.06% 520.omnetpp_r -0.33% 523.xalancbmk_r -0.22% 525.x264_r 1.72% 531.deepsjeng_r -0.26% 541.leela_r 0.57% 548.exchange2_r -0.75% 557.xz_r -1.28% Geomean -0.21% 503.bwaves_r 0.00% 507.cactuBSSN_r 2.66% 508.namd_r 3.67% 510.parest_r 1.25% 511.povray_r 2.26% 519.lbm_r 1.69% 521.wrf_r 11.03% 526.blender_r 3.39% 527.cam4_r 1.69% 538.imagick_r 64.59% 544.nab_r -0.54% 
549.fotonik3d_r 2.68% 554.roms_r 0.00% Geomean 6.19% This patch improves -march=native performance on Skylake by up to 60% and leaves -march=native performance unchanged on Haswell. gcc/ 2018-07-13 H.J. Lu <hongjiu.lu@intel.com> Sunil K Pandey <sunil.k.pandey@intel.com> PR target/84413 * config/i386/i386.c (m_CORE_AVX512): New. (m_CORE_AVX2): Likewise. (m_CORE_ALL): Add m_CORE_AVX2. * config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2. Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal and remove the rest of m_SKYLAKE_AVX512. gcc/testsuite/ 2018-07-13 H.J. Lu <hongjiu.lu@intel.com> Sunil K Pandey <sunil.k.pandey@intel.com> PR target/84413 * gcc.target/i386/pr84413-1.c: New test. * gcc.target/i386/pr84413-2.c: Likewise. * gcc.target/i386/pr84413-3.c: Likewise. Co-Authored-By: Sunil K Pandey <sunil.k.pandey@intel.com> From-SVN: r262649
This commit is contained in:
parent
814f333187
commit
7264261f64
11
ChangeLog
11
ChangeLog
|
@ -1,3 +1,14 @@
|
|||
2018-07-13 H.J. Lu <hongjiu.lu@intel.com>
|
||||
Sunil K Pandey <sunil.k.pandey@intel.com>
|
||||
|
||||
PR target/84413
|
||||
* config/i386/i386.c (m_CORE_AVX512): New.
|
||||
(m_CORE_AVX2): Likewise.
|
||||
(m_CORE_ALL): Add m_CORE_AVX2.
|
||||
* config/i386/x86-tune.def: Replace m_HASWELL with m_CORE_AVX2.
|
||||
Replace m_SKYLAKE_AVX512 with m_CORE_AVX512 on avx256_optimal
|
||||
and remove the rest of m_SKYLAKE_AVX512.
|
||||
|
||||
2018-07-06 Sebastian Huber <sebastian.huber@embedded-brains.de>
|
||||
|
||||
* config.sub: Sync with upstream version 2018-07-03.
|
||||
|
|
|
@ -138,7 +138,6 @@ const struct processor_costs *ix86_cost = NULL;
|
|||
#define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
|
||||
#define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
|
||||
#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
|
||||
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
|
||||
#define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
|
||||
#define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
|
||||
#define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
|
||||
|
@ -148,6 +147,10 @@ const struct processor_costs *ix86_cost = NULL;
|
|||
#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
|
||||
#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
|
||||
#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
|
||||
#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
|
||||
| m_ICELAKE_CLIENT | m_ICELAKE_SERVER)
|
||||
#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
|
||||
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
|
||||
#define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
|
||||
#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
|
||||
#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
|
||||
|
|
|
@ -49,9 +49,9 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
|
|||
over partial stores. For example prefer MOVZBL or MOVQ to load 8bit
|
||||
value over movb. */
|
||||
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
|
||||
m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL
|
||||
m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
|
||||
| m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
|
||||
| m_KNL | m_KNM | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_TREMONT
|
||||
| m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT
|
||||
| m_GENERIC)
|
||||
|
||||
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
|
||||
|
@ -87,8 +87,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
|
|||
DEF_TUNE (X86_TUNE_MOVX, "movx",
|
||||
m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
|
||||
| m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
|
||||
| m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_SKYLAKE_AVX512
|
||||
| m_HASWELL | m_TREMONT | m_GENERIC)
|
||||
| m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE
|
||||
| m_CORE_AVX2 | m_TREMONT | m_GENERIC)
|
||||
|
||||
/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
|
||||
full sized loads. */
|
||||
|
@ -105,19 +105,19 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
|
|||
/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
|
||||
conditional jump instruction for TARGET_64BIT. */
|
||||
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC)
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)
|
||||
|
||||
/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
|
||||
subsequent conditional jump instruction when the condition jump
|
||||
check sign flag (SF) or overflow flag (OF). */
|
||||
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC)
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER | m_ZNVER1 | m_GENERIC)
|
||||
|
||||
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
|
||||
jump instruction when the alu instruction produces the CCFLAG consumed by
|
||||
the conditional jump instruction. */
|
||||
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
|
||||
m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
|
||||
m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
|
@ -297,7 +297,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
|
|||
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
|
||||
for bit-manipulation instructions. */
|
||||
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
|
||||
m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
|
||||
m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
|
||||
|
||||
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
|
||||
on hardware capabilities. Bdver3 hardware has a loop buffer which makes
|
||||
|
@ -349,15 +349,15 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
|
|||
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
|
||||
of a sequence loading registers by parts. */
|
||||
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
|
||||
| m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
|
||||
| m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
|
||||
| m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC)
|
||||
|
||||
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
|
||||
of a sequence loading registers by parts. */
|
||||
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
|
||||
| m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS
|
||||
m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
|
||||
| m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
|
||||
| m_TREMONT | m_BDVER | m_ZNVER1 | m_GENERIC)
|
||||
|
||||
/* Use packed single precision instructions where possible. I.e. movups instead
|
||||
|
@ -446,7 +446,7 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
|
|||
|
||||
/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
|
||||
instructions in the auto-vectorizer. */
|
||||
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512)
|
||||
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Historical relics: tuning flags that helps a specific old CPU designs */
|
||||
|
|
|
@ -1,3 +1,11 @@
|
|||
2018-07-13 H.J. Lu <hongjiu.lu@intel.com>
|
||||
Sunil K Pandey <sunil.k.pandey@intel.com>
|
||||
|
||||
PR target/84413
|
||||
* gcc.target/i386/pr84413-1.c: New test.
|
||||
* gcc.target/i386/pr84413-2.c: Likewise.
|
||||
* gcc.target/i386/pr84413-3.c: Likewise.
|
||||
|
||||
2018-07-13 Bill Schmidt <wschmidt@linux.ibm.com>
|
||||
Steve Munroe <munroesj52@gmail.com>
|
||||
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O3 -march=skylake-avx512" } */
|
||||
/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
|
||||
/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
|
||||
|
||||
#define N 1024
|
||||
|
||||
double a[N], b[N], c[N];
|
||||
|
||||
void
|
||||
avx512f_test (void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O3 -march=cannonlake" } */
|
||||
/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
|
||||
/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
|
||||
|
||||
#define N 1024
|
||||
|
||||
double a[N], b[N], c[N];
|
||||
|
||||
void
|
||||
avx512f_test (void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O3 -march=icelake-server" } */
|
||||
/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
|
||||
/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
|
||||
|
||||
#define N 1024
|
||||
|
||||
double a[N], b[N], c[N];
|
||||
|
||||
void
|
||||
avx512f_test (void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
Loading…
Reference in New Issue