From 274c2d3f146e70e7205a2472c6f4d9a93380a80b Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Fri, 17 Nov 2017 18:53:49 +0100 Subject: [PATCH] i386.c (ix86_multiplication_cost, [...]): Break out from ... * i386.c (ix86_multiplication_cost, ix86_division_cost, ix86_shift_rotate_cost): Break out from ... (ix86_rtx_costs): ... here. (ix86_add_stmt_cost): Use rtx cost machinery to compute cost of vector operations. * gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel. From-SVN: r254889 --- gcc/ChangeLog | 8 + gcc/config/i386/i386.c | 420 ++++++++++++------ gcc/testsuite/ChangeLog | 4 + .../gcc.target/i386/xop-imul64-vector.c | 2 +- 4 files changed, 305 insertions(+), 129 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3864ff2fa05..9d96f9564ef 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2017-11-17 Jan Hubicka + + * i386.c (ix86_multiplication_cost, ix86_division_cost, + ix86_shift_rotate_cost): Break out from ... + (ix86_rtx_costs): ... here. + (ix86_add_stmt_cost): Use rtx cost machinery to compute cost of + vector operations. + 2017-11-17 Jan Hubicka * predict.c (determine_unlikely_bbs): Set cgraph node count to 0 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a53666966d6..835c73db9d2 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -39285,6 +39285,168 @@ ix86_vec_cost (machine_mode mode, int cost, bool parallel) return cost; } +/* Return cost of multiplication in MODE. */ + +static int +ix86_multiplication_cost (const struct processor_costs *cost, + enum machine_mode mode) +{ + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); + + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return inner_mode == DFmode ? cost->mulsd : cost->mulss; + else if (X87_FLOAT_MODE_P (mode)) + return cost->fmul; + else if (FLOAT_MODE_P (mode)) + return ix86_vec_cost (mode, + inner_mode == DFmode + ? 
cost->mulsd : cost->mulss, true); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* V*QImode is emulated with 7-13 insns. */ + if (mode == V16QImode || mode == V32QImode) + { + int extra = 11; + if (TARGET_XOP && mode == V16QImode) + extra = 5; + else if (TARGET_SSSE3) + extra = 6; + return ix86_vec_cost (mode, + cost->mulss * 2 + cost->sse_op * extra, + true); + } + /* V*DImode is emulated with 5-8 insns. */ + else if (mode == V2DImode || mode == V4DImode) + { + if (TARGET_XOP && mode == V2DImode) + return ix86_vec_cost (mode, + cost->mulss * 2 + cost->sse_op * 3, + true); + else + return ix86_vec_cost (mode, + cost->mulss * 3 + cost->sse_op * 5, + true); + } + /* Without sse4.1, we don't have PMULLD; it's emulated with 7 + insns, including two PMULUDQ. */ + else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5, + true); + else + return ix86_vec_cost (mode, cost->mulss, true); + } + else + return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); +} + +/* Return cost of division in MODE. */ + +static int +ix86_division_cost (const struct processor_costs *cost, + enum machine_mode mode) +{ + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); + + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return inner_mode == DFmode ? cost->divsd : cost->divss; + else if (X87_FLOAT_MODE_P (mode)) + return cost->fdiv; + else if (FLOAT_MODE_P (mode)) + return ix86_vec_cost (mode, + inner_mode == DFmode ? cost->divsd : cost->divss, + true); + else + return cost->divide[MODE_INDEX (mode)]; +} + +/* Return cost of shift in MODE. + If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. + AND_IN_OP1 specifies whether op1 is the result of an AND, and + SHIFT_AND_TRUNCATE whether op1 is a SUBREG of the result of an AND. + + SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. 
*/ + +static int +ix86_shift_rotate_cost (const struct processor_costs *cost, + enum machine_mode mode, bool constant_op1, + HOST_WIDE_INT op1_val, + bool speed, + bool and_in_op1, + bool shift_and_truncate, + bool *skip_op0, bool *skip_op1) +{ + if (skip_op0) + *skip_op0 = *skip_op1 = false; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* V*QImode is emulated with 1-11 insns. */ + if (mode == V16QImode || mode == V32QImode) + { + int count = 11; + if (TARGET_XOP && mode == V16QImode) + { + /* For XOP we use vpshab, which requires a broadcast of the + value to the variable shift insn. For constants this + means a V16Q const in mem; even when we can perform the + shift with one insn set the cost to prefer paddb. */ + if (constant_op1) + { + if (skip_op1) + *skip_op1 = true; + return ix86_vec_cost (mode, + cost->sse_op + + (speed + ? 2 + : COSTS_N_BYTES + (GET_MODE_UNIT_SIZE (mode))), true); + } + count = 3; + } + else if (TARGET_SSSE3) + count = 7; + return ix86_vec_cost (mode, cost->sse_op * count, true); + } + else + return ix86_vec_cost (mode, cost->sse_op, true); + } + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + if (constant_op1) + { + if (op1_val > 32) + return cost->shift_const + COSTS_N_INSNS (2); + else + return cost->shift_const * 2; + } + else + { + if (and_in_op1) + return cost->shift_var * 2; + else + return cost->shift_var * 6 + COSTS_N_INSNS (2); + } + } + else + { + if (constant_op1) + return cost->shift_const; + else if (shift_and_truncate) + { + if (skip_op0) + *skip_op0 = *skip_op1 = true; + /* Return the cost after shift-and truncation. */ + return cost->shift_var; + } + else + return cost->shift_var; + } + return cost->shift_const; +} + /* Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. 
*/ @@ -39298,9 +39460,6 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, enum rtx_code outer_code = (enum rtx_code) outer_code_i; const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; int src_cost; - machine_mode inner_mode = mode; - if (VECTOR_MODE_P (mode)) - inner_mode = GET_MODE_INNER (mode); switch (code) { @@ -39428,68 +39587,22 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case ASHIFTRT: case LSHIFTRT: case ROTATERT: - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + bool skip_op0, skip_op1; + *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), + CONST_INT_P (XEXP (x, 1)) + ? INTVAL (XEXP (x, 1)) : -1, + speed, + GET_CODE (XEXP (x, 1)) == AND, + SUBREG_P (XEXP (x, 1)) + && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, + &skip_op0, &skip_op1); + if (skip_op0 || skip_op1) { - /* ??? Should be SSE vector operation cost. */ - /* At least for published AMD latencies, this really is the same - as the latency for a simple fpu operation like fabs. */ - /* V*QImode is emulated with 1-11 insns. */ - if (mode == V16QImode || mode == V32QImode) - { - int count = 11; - if (TARGET_XOP && mode == V16QImode) - { - /* For XOP we use vpshab, which requires a broadcast of the - value to the variable shift insn. For constants this - means a V16Q const in mem; even when we can perform the - shift with one insn set the cost to prefer paddb. */ - if (CONSTANT_P (XEXP (x, 1))) - { - *total = ix86_vec_cost (mode, - cost->sse_op - + rtx_cost (XEXP (x, 0), mode, code, 0, speed) - + (speed ? 
2 : COSTS_N_BYTES (16)), true); - return true; - } - count = 3; - } - else if (TARGET_SSSE3) - count = 7; - *total = ix86_vec_cost (mode, cost->sse_op * count, true); - } - else - *total = ix86_vec_cost (mode, cost->sse_op, true); - } - else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - { - if (CONST_INT_P (XEXP (x, 1))) - { - if (INTVAL (XEXP (x, 1)) > 32) - *total = cost->shift_const + COSTS_N_INSNS (2); - else - *total = cost->shift_const * 2; - } - else - { - if (GET_CODE (XEXP (x, 1)) == AND) - *total = cost->shift_var * 2; - else - *total = cost->shift_var * 6 + COSTS_N_INSNS (2); - } - } - else - { - if (CONST_INT_P (XEXP (x, 1))) - *total = cost->shift_const; - else if (SUBREG_P (XEXP (x, 1)) - && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND) - { - /* Return the cost after shift-and truncation. */ - *total = cost->shift_var; - return true; - } - else - *total = cost->shift_var; + if (!skip_op0) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!skip_op1) + *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); + return true; } return false; @@ -39519,59 +39632,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } case MULT: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - { - *total = inner_mode == DFmode ? cost->mulsd : cost->mulss; - return false; - } - else if (X87_FLOAT_MODE_P (mode)) - { - *total = cost->fmul; - return false; - } - else if (FLOAT_MODE_P (mode)) - { - *total = ix86_vec_cost (mode, - inner_mode == DFmode - ? cost->mulsd : cost->mulss, true); - return false; - } - else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - /* V*QImode is emulated with 7-13 insns. */ - if (mode == V16QImode || mode == V32QImode) - { - int extra = 11; - if (TARGET_XOP && mode == V16QImode) - extra = 5; - else if (TARGET_SSSE3) - extra = 6; - *total = ix86_vec_cost (mode, - cost->mulss * 2 + cost->sse_op * extra, - true); - } - /* V*DImode is emulated with 5-8 insns. 
*/ - else if (mode == V2DImode || mode == V4DImode) - { - if (TARGET_XOP && mode == V2DImode) - *total = ix86_vec_cost (mode, - cost->mulss * 2 + cost->sse_op * 3, - true); - else - *total = ix86_vec_cost (mode, - cost->mulss * 3 + cost->sse_op * 5, - true); - } - /* Without sse4.1, we don't have PMULLD; it's emulated with 7 - insns, including two PMULUDQ. */ - else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) - *total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5, - true); - else - *total = ix86_vec_cost (mode, cost->mulss, true); - return false; - } - else + if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) { rtx op0 = XEXP (x, 0); rtx op1 = XEXP (x, 1); @@ -39616,21 +39677,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return true; } + *total = ix86_multiplication_cost (cost, mode); + return false; case DIV: case UDIV: case MOD: case UMOD: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - *total = inner_mode == DFmode ? cost->divsd : cost->divss; - else if (X87_FLOAT_MODE_P (mode)) - *total = cost->fdiv; - else if (FLOAT_MODE_P (mode)) - *total = ix86_vec_cost (mode, - inner_mode == DFmode ? cost->divsd : cost->divss, - true); - else - *total = cost->divide[MODE_INDEX (mode)]; + *total = ix86_division_cost (cost, mode); return false; case PLUS: @@ -48827,7 +48881,117 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, unsigned retval = 0; tree vectype = stmt_info ? 
stmt_vectype (stmt_info) : NULL_TREE; - int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + int stmt_cost = - 1; + + if ((kind == vector_stmt || kind == scalar_stmt) + && stmt_info + && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN) + { + tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt); + bool fp = false; + machine_mode mode = TImode; + + if (vectype != NULL) + { + fp = FLOAT_TYPE_P (vectype); + mode = TYPE_MODE (vectype); + } + /*machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode);*/ + + switch (subcode) + { + case PLUS_EXPR: + case POINTER_PLUS_EXPR: + case MINUS_EXPR: + if (kind == scalar_stmt) + { + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + stmt_cost = ix86_cost->addss; + else if (X87_FLOAT_MODE_P (mode)) + stmt_cost = ix86_cost->fadd; + else + stmt_cost = ix86_cost->add; + } + else + stmt_cost = ix86_vec_cost (mode, + fp ? ix86_cost->addss + : ix86_cost->sse_op, + true); + break; + + case MULT_EXPR: + case WIDEN_MULT_EXPR: + case MULT_HIGHPART_EXPR: + stmt_cost = ix86_multiplication_cost (ix86_cost, mode); + break; + case FMA_EXPR: + stmt_cost = ix86_vec_cost (mode, + mode == SFmode ? 
ix86_cost->fmass + : ix86_cost->fmasd, + true); + break; + case NEGATE_EXPR: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + stmt_cost = ix86_cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + stmt_cost = ix86_cost->fchs; + else if (VECTOR_MODE_P (mode)) + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true); + else + stmt_cost = ix86_cost->add; + break; + case TRUNC_DIV_EXPR: + case CEIL_DIV_EXPR: + case FLOOR_DIV_EXPR: + case ROUND_DIV_EXPR: + case TRUNC_MOD_EXPR: + case CEIL_MOD_EXPR: + case FLOOR_MOD_EXPR: + case RDIV_EXPR: + case ROUND_MOD_EXPR: + case EXACT_DIV_EXPR: + stmt_cost = ix86_division_cost (ix86_cost, mode); + break; + + case RSHIFT_EXPR: + case LSHIFT_EXPR: + case LROTATE_EXPR: + case RROTATE_EXPR: + { + tree op2 = gimple_assign_rhs2 (stmt_info->stmt); + stmt_cost = ix86_shift_rotate_cost + (ix86_cost, mode, + TREE_CODE (op2) == INTEGER_CST, + cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1, + true, false, false, NULL, NULL); + } + break; + case NOP_EXPR: + stmt_cost = 0; + break; + + case BIT_IOR_EXPR: + case ABS_EXPR: + case MIN_EXPR: + case MAX_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + case BIT_NOT_EXPR: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + stmt_cost = ix86_cost->sse_op; + else if (VECTOR_MODE_P (mode)) + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true); + else + stmt_cost = ix86_cost->add; + break; + default: + break; + } + } + if (stmt_cost == -1) + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); /* Penalize DFmode vector operations for Bonnell. */ if (TARGET_BONNELL && kind == vector_stmt diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 149ed11a502..c75df82e97d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2017-11-17 Jan Hubicka + + * gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel. 
+ 2017-11-17 Jan Hubicka * gcc.dg/tree-ssa/dump-2.c: Fixup template for profile precision diff --git a/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c b/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c index fc8c8806f78..43a27a5cb24 100644 --- a/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c +++ b/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c @@ -3,7 +3,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target lp64 } */ -/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize" } */ +/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize -fno-vect-cost-model" } */ extern void exit (int);