i386.c (ix86_multiplication_cost, [...]): Break out from ...
* i386.c (ix86_multiplication_cost, ix86_division_cost, ix86_shift_rotate_cost): Break out from ... (ix86_rtx_costs): ... here. (ix86_add_stmt_cost): Use rtx cost machinery to compute cost of vector operations. * gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel. From-SVN: r254889
This commit is contained in:
parent
650fe7323c
commit
274c2d3f14
@ -1,3 +1,11 @@
|
||||
2017-11-17 Jan Hubicka <hubicka@ucw.cz>
|
||||
|
||||
* i386.c (ix86_multiplication_cost, ix86_division_cost,
|
||||
ix86_shift_rotate_cost): Break out from ...
|
||||
(ix86_rtx_costs): ... here.
|
||||
(ix86_add_stmt_cost): Use rtx cost machinery to compute cost of
|
||||
vector operations.
|
||||
|
||||
2017-11-17 Jan Hubicka <hubicka@ucw.cz>
|
||||
|
||||
* predict.c (determine_unlikely_bbs): Set cgraph node count to 0
|
||||
|
@ -39285,6 +39285,168 @@ ix86_vec_cost (machine_mode mode, int cost, bool parallel)
|
||||
return cost;
|
||||
}
|
||||
|
||||
/* Return cost of multiplication in MODE. */
|
||||
|
||||
static int
|
||||
ix86_multiplication_cost (const struct processor_costs *cost,
|
||||
enum machine_mode mode)
|
||||
{
|
||||
machine_mode inner_mode = mode;
|
||||
if (VECTOR_MODE_P (mode))
|
||||
inner_mode = GET_MODE_INNER (mode);
|
||||
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
return inner_mode == DFmode ? cost->mulsd : cost->mulss;
|
||||
else if (X87_FLOAT_MODE_P (mode))
|
||||
return cost->fmul;
|
||||
else if (FLOAT_MODE_P (mode))
|
||||
return ix86_vec_cost (mode,
|
||||
inner_mode == DFmode
|
||||
? cost->mulsd : cost->mulss, true);
|
||||
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
|
||||
{
|
||||
/* V*QImode is emulated with 7-13 insns. */
|
||||
if (mode == V16QImode || mode == V32QImode)
|
||||
{
|
||||
int extra = 11;
|
||||
if (TARGET_XOP && mode == V16QImode)
|
||||
extra = 5;
|
||||
else if (TARGET_SSSE3)
|
||||
extra = 6;
|
||||
return ix86_vec_cost (mode,
|
||||
cost->mulss * 2 + cost->sse_op * extra,
|
||||
true);
|
||||
}
|
||||
/* V*DImode is emulated with 5-8 insns. */
|
||||
else if (mode == V2DImode || mode == V4DImode)
|
||||
{
|
||||
if (TARGET_XOP && mode == V2DImode)
|
||||
return ix86_vec_cost (mode,
|
||||
cost->mulss * 2 + cost->sse_op * 3,
|
||||
true);
|
||||
else
|
||||
return ix86_vec_cost (mode,
|
||||
cost->mulss * 3 + cost->sse_op * 5,
|
||||
true);
|
||||
}
|
||||
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
|
||||
insns, including two PMULUDQ. */
|
||||
else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
|
||||
return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
|
||||
true);
|
||||
else
|
||||
return ix86_vec_cost (mode, cost->mulss, true);
|
||||
}
|
||||
else
|
||||
return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
|
||||
}
|
||||
|
||||
/* Return cost of multiplication in MODE. */
|
||||
|
||||
static int
|
||||
ix86_division_cost (const struct processor_costs *cost,
|
||||
enum machine_mode mode)
|
||||
{
|
||||
machine_mode inner_mode = mode;
|
||||
if (VECTOR_MODE_P (mode))
|
||||
inner_mode = GET_MODE_INNER (mode);
|
||||
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
return inner_mode == DFmode ? cost->divsd : cost->divss;
|
||||
else if (X87_FLOAT_MODE_P (mode))
|
||||
return cost->fdiv;
|
||||
else if (FLOAT_MODE_P (mode))
|
||||
return ix86_vec_cost (mode,
|
||||
inner_mode == DFmode ? cost->divsd : cost->divss,
|
||||
true);
|
||||
else
|
||||
return cost->divide[MODE_INDEX (mode)];
|
||||
}
|
||||
|
||||
/* Return cost of shift in MODE.
|
||||
If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
|
||||
AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE
|
||||
if op1 is a result of subreg.
|
||||
|
||||
SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
|
||||
|
||||
static int
|
||||
ix86_shift_rotate_cost (const struct processor_costs *cost,
|
||||
enum machine_mode mode, bool constant_op1,
|
||||
HOST_WIDE_INT op1_val,
|
||||
bool speed,
|
||||
bool and_in_op1,
|
||||
bool shift_and_truncate,
|
||||
bool *skip_op0, bool *skip_op1)
|
||||
{
|
||||
if (skip_op0)
|
||||
*skip_op0 = *skip_op1 = false;
|
||||
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
|
||||
{
|
||||
/* V*QImode is emulated with 1-11 insns. */
|
||||
if (mode == V16QImode || mode == V32QImode)
|
||||
{
|
||||
int count = 11;
|
||||
if (TARGET_XOP && mode == V16QImode)
|
||||
{
|
||||
/* For XOP we use vpshab, which requires a broadcast of the
|
||||
value to the variable shift insn. For constants this
|
||||
means a V16Q const in mem; even when we can perform the
|
||||
shift with one insn set the cost to prefer paddb. */
|
||||
if (constant_op1)
|
||||
{
|
||||
if (skip_op1)
|
||||
*skip_op1 = true;
|
||||
return ix86_vec_cost (mode,
|
||||
cost->sse_op
|
||||
+ (speed
|
||||
? 2
|
||||
: COSTS_N_BYTES
|
||||
(GET_MODE_UNIT_SIZE (mode))), true);
|
||||
}
|
||||
count = 3;
|
||||
}
|
||||
else if (TARGET_SSSE3)
|
||||
count = 7;
|
||||
return ix86_vec_cost (mode, cost->sse_op * count, true);
|
||||
}
|
||||
else
|
||||
return ix86_vec_cost (mode, cost->sse_op, true);
|
||||
}
|
||||
if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
|
||||
{
|
||||
if (constant_op1)
|
||||
{
|
||||
if (op1_val > 32)
|
||||
return cost->shift_const + COSTS_N_INSNS (2);
|
||||
else
|
||||
return cost->shift_const * 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (and_in_op1)
|
||||
return cost->shift_var * 2;
|
||||
else
|
||||
return cost->shift_var * 6 + COSTS_N_INSNS (2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (constant_op1)
|
||||
return cost->shift_const;
|
||||
else if (shift_and_truncate)
|
||||
{
|
||||
if (skip_op0)
|
||||
*skip_op0 = *skip_op1 = true;
|
||||
/* Return the cost after shift-and truncation. */
|
||||
return cost->shift_var;
|
||||
}
|
||||
else
|
||||
return cost->shift_var;
|
||||
}
|
||||
return cost->shift_const;
|
||||
}
|
||||
|
||||
/* Compute a (partial) cost for rtx X. Return true if the complete
|
||||
cost has been computed, and false if subexpressions should be
|
||||
scanned. In either case, *TOTAL contains the cost result. */
|
||||
@ -39298,9 +39460,6 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
|
||||
enum rtx_code outer_code = (enum rtx_code) outer_code_i;
|
||||
const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
|
||||
int src_cost;
|
||||
machine_mode inner_mode = mode;
|
||||
if (VECTOR_MODE_P (mode))
|
||||
inner_mode = GET_MODE_INNER (mode);
|
||||
|
||||
switch (code)
|
||||
{
|
||||
@ -39428,68 +39587,22 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
|
||||
case ASHIFTRT:
|
||||
case LSHIFTRT:
|
||||
case ROTATERT:
|
||||
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
|
||||
bool skip_op0, skip_op1;
|
||||
*total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
|
||||
CONST_INT_P (XEXP (x, 1))
|
||||
? INTVAL (XEXP (x, 1)) : -1,
|
||||
speed,
|
||||
GET_CODE (XEXP (x, 1)) == AND,
|
||||
SUBREG_P (XEXP (x, 1))
|
||||
&& GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
|
||||
&skip_op0, &skip_op1);
|
||||
if (skip_op0 || skip_op1)
|
||||
{
|
||||
/* ??? Should be SSE vector operation cost. */
|
||||
/* At least for published AMD latencies, this really is the same
|
||||
as the latency for a simple fpu operation like fabs. */
|
||||
/* V*QImode is emulated with 1-11 insns. */
|
||||
if (mode == V16QImode || mode == V32QImode)
|
||||
{
|
||||
int count = 11;
|
||||
if (TARGET_XOP && mode == V16QImode)
|
||||
{
|
||||
/* For XOP we use vpshab, which requires a broadcast of the
|
||||
value to the variable shift insn. For constants this
|
||||
means a V16Q const in mem; even when we can perform the
|
||||
shift with one insn set the cost to prefer paddb. */
|
||||
if (CONSTANT_P (XEXP (x, 1)))
|
||||
{
|
||||
*total = ix86_vec_cost (mode,
|
||||
cost->sse_op
|
||||
+ rtx_cost (XEXP (x, 0), mode, code, 0, speed)
|
||||
+ (speed ? 2 : COSTS_N_BYTES (16)), true);
|
||||
return true;
|
||||
}
|
||||
count = 3;
|
||||
}
|
||||
else if (TARGET_SSSE3)
|
||||
count = 7;
|
||||
*total = ix86_vec_cost (mode, cost->sse_op * count, true);
|
||||
}
|
||||
else
|
||||
*total = ix86_vec_cost (mode, cost->sse_op, true);
|
||||
}
|
||||
else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
|
||||
{
|
||||
if (CONST_INT_P (XEXP (x, 1)))
|
||||
{
|
||||
if (INTVAL (XEXP (x, 1)) > 32)
|
||||
*total = cost->shift_const + COSTS_N_INSNS (2);
|
||||
else
|
||||
*total = cost->shift_const * 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (GET_CODE (XEXP (x, 1)) == AND)
|
||||
*total = cost->shift_var * 2;
|
||||
else
|
||||
*total = cost->shift_var * 6 + COSTS_N_INSNS (2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (CONST_INT_P (XEXP (x, 1)))
|
||||
*total = cost->shift_const;
|
||||
else if (SUBREG_P (XEXP (x, 1))
|
||||
&& GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
|
||||
{
|
||||
/* Return the cost after shift-and truncation. */
|
||||
*total = cost->shift_var;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
*total = cost->shift_var;
|
||||
if (!skip_op0)
|
||||
*total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
|
||||
if (!skip_op1)
|
||||
*total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
@ -39519,59 +39632,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
|
||||
}
|
||||
|
||||
case MULT:
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
{
|
||||
*total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
|
||||
return false;
|
||||
}
|
||||
else if (X87_FLOAT_MODE_P (mode))
|
||||
{
|
||||
*total = cost->fmul;
|
||||
return false;
|
||||
}
|
||||
else if (FLOAT_MODE_P (mode))
|
||||
{
|
||||
*total = ix86_vec_cost (mode,
|
||||
inner_mode == DFmode
|
||||
? cost->mulsd : cost->mulss, true);
|
||||
return false;
|
||||
}
|
||||
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
|
||||
{
|
||||
/* V*QImode is emulated with 7-13 insns. */
|
||||
if (mode == V16QImode || mode == V32QImode)
|
||||
{
|
||||
int extra = 11;
|
||||
if (TARGET_XOP && mode == V16QImode)
|
||||
extra = 5;
|
||||
else if (TARGET_SSSE3)
|
||||
extra = 6;
|
||||
*total = ix86_vec_cost (mode,
|
||||
cost->mulss * 2 + cost->sse_op * extra,
|
||||
true);
|
||||
}
|
||||
/* V*DImode is emulated with 5-8 insns. */
|
||||
else if (mode == V2DImode || mode == V4DImode)
|
||||
{
|
||||
if (TARGET_XOP && mode == V2DImode)
|
||||
*total = ix86_vec_cost (mode,
|
||||
cost->mulss * 2 + cost->sse_op * 3,
|
||||
true);
|
||||
else
|
||||
*total = ix86_vec_cost (mode,
|
||||
cost->mulss * 3 + cost->sse_op * 5,
|
||||
true);
|
||||
}
|
||||
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
|
||||
insns, including two PMULUDQ. */
|
||||
else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
|
||||
*total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
|
||||
true);
|
||||
else
|
||||
*total = ix86_vec_cost (mode, cost->mulss, true);
|
||||
return false;
|
||||
}
|
||||
else
|
||||
if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
|
||||
{
|
||||
rtx op0 = XEXP (x, 0);
|
||||
rtx op1 = XEXP (x, 1);
|
||||
@ -39616,21 +39677,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
|
||||
|
||||
return true;
|
||||
}
|
||||
*total = ix86_multiplication_cost (cost, mode);
|
||||
return false;
|
||||
|
||||
case DIV:
|
||||
case UDIV:
|
||||
case MOD:
|
||||
case UMOD:
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
*total = inner_mode == DFmode ? cost->divsd : cost->divss;
|
||||
else if (X87_FLOAT_MODE_P (mode))
|
||||
*total = cost->fdiv;
|
||||
else if (FLOAT_MODE_P (mode))
|
||||
*total = ix86_vec_cost (mode,
|
||||
inner_mode == DFmode ? cost->divsd : cost->divss,
|
||||
true);
|
||||
else
|
||||
*total = cost->divide[MODE_INDEX (mode)];
|
||||
*total = ix86_division_cost (cost, mode);
|
||||
return false;
|
||||
|
||||
case PLUS:
|
||||
@ -48827,7 +48881,117 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
|
||||
unsigned retval = 0;
|
||||
|
||||
tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
|
||||
int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
|
||||
int stmt_cost = - 1;
|
||||
|
||||
if ((kind == vector_stmt || kind == scalar_stmt)
|
||||
&& stmt_info
|
||||
&& stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
|
||||
{
|
||||
tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
|
||||
bool fp = false;
|
||||
machine_mode mode = TImode;
|
||||
|
||||
if (vectype != NULL)
|
||||
{
|
||||
fp = FLOAT_TYPE_P (vectype);
|
||||
mode = TYPE_MODE (vectype);
|
||||
}
|
||||
/*machine_mode inner_mode = mode;
|
||||
if (VECTOR_MODE_P (mode))
|
||||
inner_mode = GET_MODE_INNER (mode);*/
|
||||
|
||||
switch (subcode)
|
||||
{
|
||||
case PLUS_EXPR:
|
||||
case POINTER_PLUS_EXPR:
|
||||
case MINUS_EXPR:
|
||||
if (kind == scalar_stmt)
|
||||
{
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
stmt_cost = ix86_cost->addss;
|
||||
else if (X87_FLOAT_MODE_P (mode))
|
||||
stmt_cost = ix86_cost->fadd;
|
||||
else
|
||||
stmt_cost = ix86_cost->add;
|
||||
}
|
||||
else
|
||||
stmt_cost = ix86_vec_cost (mode,
|
||||
fp ? ix86_cost->addss
|
||||
: ix86_cost->sse_op,
|
||||
true);
|
||||
break;
|
||||
|
||||
case MULT_EXPR:
|
||||
case WIDEN_MULT_EXPR:
|
||||
case MULT_HIGHPART_EXPR:
|
||||
stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
|
||||
break;
|
||||
case FMA_EXPR:
|
||||
stmt_cost = ix86_vec_cost (mode,
|
||||
mode == SFmode ? ix86_cost->fmass
|
||||
: ix86_cost->fmasd,
|
||||
true);
|
||||
break;
|
||||
case NEGATE_EXPR:
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
stmt_cost = ix86_cost->sse_op;
|
||||
else if (X87_FLOAT_MODE_P (mode))
|
||||
stmt_cost = ix86_cost->fchs;
|
||||
else if (VECTOR_MODE_P (mode))
|
||||
stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
|
||||
else
|
||||
stmt_cost = ix86_cost->add;
|
||||
break;
|
||||
case TRUNC_DIV_EXPR:
|
||||
case CEIL_DIV_EXPR:
|
||||
case FLOOR_DIV_EXPR:
|
||||
case ROUND_DIV_EXPR:
|
||||
case TRUNC_MOD_EXPR:
|
||||
case CEIL_MOD_EXPR:
|
||||
case FLOOR_MOD_EXPR:
|
||||
case RDIV_EXPR:
|
||||
case ROUND_MOD_EXPR:
|
||||
case EXACT_DIV_EXPR:
|
||||
stmt_cost = ix86_division_cost (ix86_cost, mode);
|
||||
break;
|
||||
|
||||
case RSHIFT_EXPR:
|
||||
case LSHIFT_EXPR:
|
||||
case LROTATE_EXPR:
|
||||
case RROTATE_EXPR:
|
||||
{
|
||||
tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
|
||||
stmt_cost = ix86_shift_rotate_cost
|
||||
(ix86_cost, mode,
|
||||
TREE_CODE (op2) == INTEGER_CST,
|
||||
cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
|
||||
true, false, false, NULL, NULL);
|
||||
}
|
||||
break;
|
||||
case NOP_EXPR:
|
||||
stmt_cost = 0;
|
||||
break;
|
||||
|
||||
case BIT_IOR_EXPR:
|
||||
case ABS_EXPR:
|
||||
case MIN_EXPR:
|
||||
case MAX_EXPR:
|
||||
case BIT_XOR_EXPR:
|
||||
case BIT_AND_EXPR:
|
||||
case BIT_NOT_EXPR:
|
||||
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
|
||||
stmt_cost = ix86_cost->sse_op;
|
||||
else if (VECTOR_MODE_P (mode))
|
||||
stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
|
||||
else
|
||||
stmt_cost = ix86_cost->add;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (stmt_cost == -1)
|
||||
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
|
||||
|
||||
/* Penalize DFmode vector operations for Bonnell. */
|
||||
if (TARGET_BONNELL && kind == vector_stmt
|
||||
|
@ -1,3 +1,7 @@
|
||||
2017-11-17 Jan Hubicka <hubicka@ucw.cz>
|
||||
|
||||
* gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel.
|
||||
|
||||
2017-11-17 Jan Hubicka <hubicka@ucw.cz>
|
||||
|
||||
* gcc.dg/tree-ssa/dump-2.c: Fixup template for profile precision
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target lp64 } */
|
||||
/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize" } */
|
||||
/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize -fno-vect-cost-model" } */
|
||||
|
||||
extern void exit (int);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user