MSP430: Implement TARGET_RTX_COSTS

Costs of MSP430 instructions are mostly just a function of the type and
number of operands; knowledge of the specific instruction often
isn't required to calculate the cost.
In these cases, TARGET_RTX_COSTS just needs to examine the operands to
calculate the cost of the expression.

For more complicated operations where library helper functions are
required, if the cost cannot be accurately calculated, it is estimated
and disparaged relative to the cost of a single instruction.

gcc/ChangeLog:

	* config/msp430/msp430.c (use_helper_for_const_shift): Add forward
	declaration.
	Remove unused argument.
	(struct msp430_multlib_costs): New struct.
	(msp430_is_mem_indirect): New function.
	(msp430_costs): Likewise.
	(msp430_shift_costs): Likewise.
	(msp430_muldiv_costs): Likewise.
	(msp430_get_inner_dest_code): Likewise.
	(msp430_single_op_cost): Likewise.
	(msp430_rtx_costs): Rewrite from scratch.
	(msp430_expand_shift): Adjust use_helper_for_const_shift call.
This commit is contained in:
Jozef Lawrynowicz 2020-11-13 15:35:42 +00:00
parent 953587a2b0
commit f62dd39823
1 changed file with 485 additions and 17 deletions

View File

@ -49,6 +49,9 @@
#include "msp430-devices.h"
#include "incpath.h"
#include "prefix.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
/* This file should be included last. */
#include "target-def.h"
@ -56,6 +59,7 @@
static void msp430_compute_frame_info (void);
static bool use_32bit_hwmult (void);
static bool use_helper_for_const_shift (machine_mode mode, HOST_WIDE_INT amt);
@ -1118,6 +1122,28 @@ static const struct double_op_cost size_cost_double_op =
2, 2, 3
};
/* Relative costs of the multiply/divide library helper calls, indexed by the
   mode of the operation (HImode, SImode and DImode respectively).  */
struct msp430_multlib_costs
{
  const int mulhi;
  const int mulsi;
  const int muldi;
};

/* There is no precise size cost when using libcalls, instead it is disparaged
   relative to other instructions.
   The cycle costs are from the CALL to the RET, inclusive.
   FIXME muldi cost is not accurate.  */
/* Cycle costs used when a 32-bit hardware multiplier is available
   (presumably also the F5-series multiplier — confirm against
   msp430_muldiv_costs usage).  */
static const struct msp430_multlib_costs cycle_cost_multlib_32bit =
{
  27, 33, 66
};

/* 32bit multiply takes a few more instructions on 16bit hwmult.  */
static const struct msp430_multlib_costs cycle_cost_multlib_16bit =
{
  27, 42, 66
};
/* TARGET_REGISTER_MOVE_COST
There is only one class of general-purpose, non-fixed registers, and the
relative cost of moving data between them is always the same.
@ -1160,29 +1186,469 @@ msp430_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
because there are no conditional move insns - when a condition is involved,
the only option is to use a cbranch. */
/* For X, which must be a MEM RTX, return TRUE if it is an indirect memory
   reference, i.e. it uses register-indirect (@Rn) or post-increment (@Rn+)
   addressing.  */
static bool
msp430_is_mem_indirect (rtx x)
{
  gcc_assert (GET_CODE (x) == MEM);
  switch (GET_CODE (XEXP (x, 0)))
    {
    case REG:
    case POST_INC:
      return true;
    default:
      return false;
    }
}
/* Costs of MSP430 instructions are generally based on the addressing mode
   combination of the source and destination operands.
   Given source operand SRC (which may be NULL to indicate a single-operand
   instruction) and destination operand DST return the cost of this
   expression.  */
static int
msp430_costs (rtx src, rtx dst, bool speed, rtx outer_rtx)
{
  enum rtx_code src_code = GET_CODE (src);
  enum rtx_code dst_code = GET_CODE (dst);
  enum rtx_code outer_code = GET_CODE (outer_rtx);
  machine_mode outer_mode = GET_MODE (outer_rtx);
  const struct double_op_cost *cost_p
    = (speed ? &cycle_cost_double_op : &size_cost_double_op);

  if (outer_code == TRUNCATE
      && (outer_mode == QImode
	  || outer_mode == HImode
	  || outer_mode == PSImode))
    /* Truncation to these modes is normally free as a side effect of the
       instructions themselves.  */
    return 0;

  if (dst_code == SYMBOL_REF
      || dst_code == LABEL_REF
      || dst_code == CONST_INT)
    /* Catch RTX like (minus (const_int 0) (reg)) but don't add any cost.  */
    return 0;

  switch (src_code)
    {
    case REG:
      if (dst_code == REG)
	return cost_p->r2r;
      return (dst_code == PC ? cost_p->r2pc : cost_p->r2m);

    case CONST_INT:
    case SYMBOL_REF:
    case LABEL_REF:
    case CONST:
      if (dst_code == REG)
	return cost_p->imm2r;
      return (dst_code == PC ? cost_p->imm2pc : cost_p->imm2m);

    case MEM:
      if (msp430_is_mem_indirect (src))
	{
	  if (dst_code == REG)
	    return cost_p->ind2r;
	  return (dst_code == PC ? cost_p->ind2pc : cost_p->ind2m);
	}
      if (dst_code == REG)
	return cost_p->mem2r;
      return (dst_code == PC ? cost_p->mem2pc : cost_p->mem2m);

    default:
      /* Assume the worst case of a fully general memory operand.  */
      return cost_p->mem2m;
    }
}
/* Given source operand SRC and destination operand DST from the shift or
   rotate RTX OUTER_RTX, return the cost of performing that shift, assuming
   optimization for speed when SPEED is true.  */
static int
msp430_shift_costs (rtx src, rtx dst, bool speed, rtx outer_rtx)
{
  int amt;
  enum rtx_code src_code = GET_CODE (src);
  enum rtx_code dst_code = GET_CODE (dst);
  const struct single_op_cost *cost_p;

  cost_p = (speed ? &cycle_cost_single_op : &size_cost_single_op);

  if (src_code != CONST_INT)
    /* The size or speed cost when the shift amount is unknown cannot be
       accurately calculated, so just disparage it slightly.  */
    return 2 * msp430_costs (src, dst, speed, outer_rtx);

  amt = INTVAL (src);
  if (use_helper_for_const_shift (GET_MODE (outer_rtx), amt))
    {
      /* GCC sometimes tries to perform shifts in some very inventive ways,
	 resulting in much larger code size usage than necessary, if
	 they are disparaged too much here.  So in general, if
	 use_helper_for_const_shift thinks a helper should be used, obey
	 that and don't disparage the shift any more than a regular
	 instruction, even though the shift may actually cost more.
	 This ensures that the RTL generated at the initial expand pass has the
	 expected shift instructions, which can be mapped to the helper
	 functions.  */
      return msp430_costs (src, dst, speed, outer_rtx);
    }

  if (!msp430x)
    {
      /* Each shift by one place will be emitted individually.  */
      switch (dst_code)
	{
	case REG:
	case CONST_INT:
	  return amt * cost_p->reg;
	case MEM:
	  if (msp430_is_mem_indirect (dst))
	    return amt * cost_p->ind;
	  else
	    return amt * cost_p->mem;
	default:
	  return amt * cost_p->mem;
	}
    }

  /* RRAM, RRCM, RRUM, RLAM are used for shift counts <= 4, otherwise, the 'X'
     versions are used.
     Instructions which shift a MEM operand will never actually be output.  It
     will always be copied into a register to allow for efficient shifting.  So
     the cost just takes into account the cost of an additional copy in that
     case.
     The outer parentheses around the first ternary are required: without them
     the copy cost would bind only to the "amt > 4" arm because of ?:
     precedence, so shifts of a MEM operand by <= 4 places would incorrectly
     get the register copy for free.  */
  return ((amt <= 4 ? (speed ? amt : 1) : (speed ? amt + 1 : 2))
	  + (dst_code == REG ? 0
	     : msp430_costs (dst, gen_rtx_REG (HImode, 10), speed, outer_rtx)));
}
/* Given source operand SRC and destination operand DST from the MULT/DIV/MOD
   RTX OUTER_RTX, return the cost of performing that operation, assuming
   optimization for speed when SPEED is true.  */
static int
msp430_muldiv_costs (rtx src, rtx dst, bool speed, rtx outer_rtx,
		     machine_mode outer_mode)
{
  enum rtx_code outer_code = GET_CODE (outer_rtx);
  const struct msp430_multlib_costs *cost_p;
  /* TRUE when only the 16-bit hardware multiplier is available.  */
  bool hwmult_16bit = (msp430_has_hwmult () && !(msp430_use_f5_series_hwmult ()
						 || use_32bit_hwmult ()));

  /* With only a 16-bit hardware multiplier, wider multiplications need the
     more expensive 16-bit library routines (see cycle_cost_multlib_16bit);
     the previous code had this selection inverted.  */
  cost_p = (hwmult_16bit
	    ? &cycle_cost_multlib_16bit
	    : &cycle_cost_multlib_32bit);

  int factor = 1;
  /* Only used in some calculations.  */
  int mode_factor = 1;
  if (outer_mode == SImode)
    mode_factor = 2;
  else if (outer_mode == PSImode)
    /* PSImode multiplication is performed using SImode operands, so has extra
       cost to factor in the conversions necessary before/after the
       operation.  */
    mode_factor = 3;
  else if (outer_mode == DImode)
    mode_factor = 4;

  if (!speed)
    {
      /* The codesize cost of using a helper function to perform the
	 multiplication or division cannot be accurately calculated, since the
	 cost depends on how many times the operation is performed in the
	 entire program.  */
      if (outer_code != MULT)
	/* Division is always expensive.  */
	factor = 7;
      else if (((hwmult_16bit && outer_mode != DImode)
		|| use_32bit_hwmult () || msp430_use_f5_series_hwmult ()))
	/* When the hardware multiplier is available, only disparage
	   slightly.  */
	factor = 2;
      else
	factor = 5;
      return factor * mode_factor * msp430_costs (src, dst, speed, outer_rtx);
    }

  /* When there is hardware multiply support, there is a relatively low, fixed
     cycle cost to performing any multiplication, but when there is no hardware
     multiply support it is very costly.  That precise cycle cost has not been
     calculated here.
     Division is extra slow since it always uses a software library.
     The 16-bit hardware multiply library cannot be used to produce 64-bit
     results.  */
  if (outer_code != MULT || !msp430_has_hwmult ()
      || (outer_mode == DImode && hwmult_16bit))
    {
      factor = (outer_code == MULT ? 50 : 70);
      return factor * mode_factor * msp430_costs (src, dst, speed, outer_rtx);
    }

  switch (outer_mode)
    {
    case E_QImode:
    case E_HImode:
      /* Include the cost of copying the operands into and out of the hardware
	 multiply routine.  */
      return cost_p->mulhi + (3 * msp430_costs (src, dst, speed, outer_rtx));
    case E_PSImode:
      /* Extra factor for the conversions necessary to do PSI->SI before the
	 operation.  */
      factor = 2;
      /* fallthru.  */
    case E_SImode:
      return factor * (cost_p->mulsi
		       + (6 * msp430_costs (src, dst, speed, outer_rtx)));
    case E_DImode:
    default:
      return cost_p->muldi + (12 * msp430_costs (src, dst, speed, outer_rtx));
    }
}
/* Recurse within X to find the actual destination operand of the expression.
   For example:
   (plus (ashift (minus (ashift (reg)
   (const_int) ......
   should return the reg RTX.
   The result is used by the cost functions to classify the addressing mode
   of the operand that will ultimately be written.  */
static rtx
msp430_get_inner_dest_code (rtx x)
{
  enum rtx_code code = GET_CODE (x);
  /* NOTE(review): XEXP is applied before checking whether CODE actually has
     an 'e' operand; the leaf cases below return X before OP0 is used, but
     confirm this is safe for CONST_INT et al. in RTL-checking builds.  */
  rtx op0 = XEXP (x, 0);
  switch (code)
    {
    /* Leaf operands: these are already the innermost destination.  */
    case REG:
    case SYMBOL_REF:
    case CONST_INT:
    case CONST:
    case LABEL_REF:
      return x;
    case MEM:
      /* Return the MEM expr not the inner REG for these cases.  */
      switch (GET_CODE (op0))
	{
	case REG:
	case SYMBOL_REF:
	case LABEL_REF:
	case CONST:
	case POST_INC:
	  return x;
	case PLUS:
	  /* return MEM (PLUS (REG) (CONST)) */
	  if (GET_CODE (XEXP (op0, 0)) == REG)
	    {
	      if (GET_CODE (XEXP (op0, 1)) == CONST_INT
		  || GET_CODE (XEXP (op0, 1)) == CONST
		  || GET_CODE (XEXP (op0, 1)) == LABEL_REF
		  || GET_CODE (XEXP (op0, 1)) == SYMBOL_REF)
		return x;
	      else
		/* A more complicated offset expression: keep digging.  */
		return msp430_get_inner_dest_code (op0);
	    }
	  return msp430_get_inner_dest_code (op0);
	default:
	  /* Stop if the address has no sub-expression to recurse into.  */
	  if (GET_RTX_FORMAT (code)[0] != 'e')
	    return x;
	  return msp430_get_inner_dest_code (op0);
	}
      break;
    default:
      if (op0 == NULL_RTX)
	gcc_unreachable ();
      else
	{
	  /* Stop when the first operand is not an expression (and so cannot
	     be recursed into); ENTRY_VALUE is excluded because its operand
	     format differs from its rtx format — TODO confirm rationale.  */
	  if (GET_RTX_FORMAT (code)[0] != 'e'
	      && code != ENTRY_VALUE)
	    return x;
	  return msp430_get_inner_dest_code (op0);
	}
    }
}
/* Calculate the cost of an MSP430 single-operand instruction, for operand DST
   within the RTX OUTER_RTX, optimizing for speed if SPEED is true.  */
static int
msp430_single_op_cost (rtx dst, bool speed, rtx outer_rtx)
{
  enum rtx_code dst_code = GET_CODE (dst);
  const struct single_op_cost *cost_p
    = (speed ? &cycle_cost_single_op : &size_cost_single_op);
  const struct double_op_cost *double_op_cost_p
    = (speed ? &cycle_cost_double_op : &size_cost_double_op);

  if (dst_code == REG)
    return cost_p->reg;

  if (dst_code == MEM)
    return (msp430_is_mem_indirect (dst) ? cost_p->ind : cost_p->mem);

  if (dst_code == CONST_INT
      || dst_code == CONST_FIXED
      || dst_code == CONST_DOUBLE
      || dst_code == SYMBOL_REF
      || dst_code == CONST)
    /* A constant value would need to be copied into a register first.  */
    return double_op_cost_p->imm2r + cost_p->reg;

  /* Anything else is treated as a general memory operand.  */
  return cost_p->mem;
}
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS msp430_rtx_costs

/* This target hook describes the relative costs of RTL expressions.
   The function recurses to just before the lowest level of the expression,
   when both of the operands of the expression can be examined at the same time.
   This is because the cost of the expression depends on the specific
   addressing mode combination of the operands.
   The hook returns true when all subexpressions of X have been processed, and
   false when rtx_cost should recurse.
   Note: the stale pre-rewrite copies of the declaration and of the "int code"
   local (left over from a mis-applied diff) have been removed — they made the
   function redefine CODE and left the CALL/SIGN_EXTEND handling unreachable
   behind a dead "break".  */
static bool
msp430_rtx_costs (rtx x,
		  machine_mode mode,
		  int outer_code ATTRIBUTE_UNUSED,
		  int opno ATTRIBUTE_UNUSED,
		  int * total,
		  bool speed)
{
  enum rtx_code code = GET_CODE (x);
  rtx dst, src;
  rtx dst_inner, src_inner;

  *total = 0;
  dst = XEXP (x, 0);
  if (GET_RTX_LENGTH (code) == 1)
    /* Some RTX that are single-op in GCC are double-op when translated to
       MSP430 instructions e.g NOT, NEG, ZERO_EXTEND.  */
    src = dst;
  else
    src = XEXP (x, 1);

  switch (code)
    {
    case SET:
      /* Ignoring SET improves codesize.  */
      if (!speed)
	return true;
      /* fallthru.  */
    case PLUS:
      if (outer_code == MEM)
	/* Do not add any cost for the plus itself, but recurse in case there
	   are more complicated RTX inside.  */
	return false;
      /* fallthru.  */
    case MINUS:
    case AND:
    case IOR:
    case XOR:
    case NOT:
    case ZERO_EXTEND:
    case TRUNCATE:
    case NEG:
    case ZERO_EXTRACT:
    case SIGN_EXTRACT:
    case IF_THEN_ELSE:
      dst_inner = msp430_get_inner_dest_code (dst);
      src_inner = msp430_get_inner_dest_code (src);
      *total = COSTS_N_INSNS (msp430_costs (src_inner, dst_inner, speed, x));
      /* Wider modes require proportionally more instructions.  */
      if (mode == SImode)
	*total *= 2;
      if (mode == DImode)
	*total *= 4;
      return false;
    case ROTATE:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      dst_inner = msp430_get_inner_dest_code (dst);
      src_inner = msp430_get_inner_dest_code (src);
      *total = COSTS_N_INSNS (msp430_shift_costs (src_inner, dst_inner,
						  speed, x));
      if (mode == SImode)
	*total *= 2;
      if (mode == DImode)
	*total *= 4;
      return false;
    case MULT:
    case DIV:
    case MOD:
    case UDIV:
    case UMOD:
      dst_inner = msp430_get_inner_dest_code (dst);
      src_inner = msp430_get_inner_dest_code (src);
      *total = COSTS_N_INSNS (msp430_muldiv_costs (src_inner, dst_inner, speed,
						   x, mode));
      return false;
    case CALL:
    case SIGN_EXTEND:
      if (mode == SImode && outer_code == SET)
	{
	  *total = COSTS_N_INSNS (4);
	  return true;
	}
      dst_inner = msp430_get_inner_dest_code (dst);
      *total = COSTS_N_INSNS (msp430_single_op_cost (dst_inner, speed, x));
      if (mode == SImode)
	*total *= 2;
      if (mode == DImode)
	*total *= 4;
      return false;
    case CONST_INT:
    case CONST_FIXED:
    case CONST_DOUBLE:
    case SYMBOL_REF:
    case CONST:
    case LABEL_REF:
    case REG:
    case PC:
    case POST_INC:
      if (mode == SImode)
	*total = COSTS_N_INSNS (2);
      else if (mode == DImode)
	*total = COSTS_N_INSNS (4);
      return true;
    case MEM:
      /* PSImode operands are expensive when in memory.  */
      if (mode == PSImode)
	*total = COSTS_N_INSNS (1);
      else if (mode == SImode)
	*total = COSTS_N_INSNS (2);
      else if (mode == DImode)
	*total = COSTS_N_INSNS (4);
      /* Recurse into the MEM.  */
      return false;
    case EQ:
    case NE:
    case GT:
    case GTU:
    case GE:
    case GEU:
    case LT:
    case LTU:
    case LE:
    case LEU:
      /* Conditions are mostly equivalent, changing their relative
	 costs has no effect.  */
      return false;
    case ASM_OPERANDS:
    case ASM_INPUT:
    case CLOBBER:
    case COMPARE:
    case CONCAT:
    case ENTRY_VALUE:
      /* Other unhandled expressions.  */
      return false;
    default:
      return false;
    }
  return false;
}
/* Function Entry and Exit */
@ -2947,8 +3413,7 @@ msp430_expand_helper (rtx *operands, const char *helper_name,
/* Return TRUE if the helper function should be used and FALSE if the shifts
insns should be emitted inline. */
static bool
use_helper_for_const_shift (enum rtx_code code, machine_mode mode,
HOST_WIDE_INT amt)
use_helper_for_const_shift (machine_mode mode, HOST_WIDE_INT amt)
{
const int default_inline_shift = 4;
/* We initialize the option to 65 so we know if the user set it or not. */
@ -2959,6 +3424,9 @@ use_helper_for_const_shift (enum rtx_code code, machine_mode mode,
the heuristic accordingly. */
int max_inline_32 = max_inline / 2;
if (mode == E_DImode)
return true;
/* Don't use helpers for these modes on 430X, when optimizing for speed, or
when emitting a small number of insns. */
if ((mode == E_QImode || mode == E_HImode || mode == E_PSImode)
@ -2996,7 +3464,7 @@ msp430_expand_shift (enum rtx_code code, machine_mode mode, rtx *operands)
constant. */
if (!CONST_INT_P (operands[2])
|| mode == E_DImode
|| use_helper_for_const_shift (code, mode, INTVAL (operands[2])))
|| use_helper_for_const_shift (mode, INTVAL (operands[2])))
{
const char *helper_name = NULL;
/* The const variants of mspabi shifts have significantly larger code