[AArch64] Emit division using the Newton series
2016-06-13 Evandro Menezes <e.menezes@samsung.com> Wilco Dijkstra <Wilco.Dijkstra@arm.com> gcc/ * config/aarch64/aarch64-protos.h (cpu_approx_modes): Add new member "division". (aarch64_emit_approx_div): Declare new function. * config/aarch64/aarch64.c (generic_approx_modes): New member "division". (exynosm1_approx_modes): Likewise. (xgene1_approx_modes): Likewise. (aarch64_emit_approx_div): Define new function. * config/aarch64/aarch64.md ("div<mode>3"): New expansion. * config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise. * config/aarch64/aarch64.opt (-mlow-precision-div): Add new option. * doc/invoke.texi (-mlow-precision-div): Describe new option. From-SVN: r237397
This commit is contained in:
parent
98daafa0b3
commit
79a2bc2dc9
@ -192,6 +192,7 @@ struct cpu_branch_cost
|
||||
/* Allowed modes for approximations. */
|
||||
struct cpu_approx_modes
{
  /* Modes allowed for the division approximation.  */
  const unsigned int division;

  /* Modes allowed for the square root approximation.  */
  const unsigned int sqrt;

  /* Modes allowed for the reciprocal square root approximation.  */
  const unsigned int recip_sqrt;
};
|
||||
@ -303,6 +304,7 @@ int aarch64_branch_cost (bool, bool);
|
||||
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
|
||||
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
|
||||
bool aarch64_constant_address_p (rtx);
|
||||
bool aarch64_emit_approx_div (rtx, rtx, rtx);
|
||||
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
|
||||
bool aarch64_expand_movmem (rtx *);
|
||||
bool aarch64_float_const_zero_rtx_p (rtx);
|
||||
|
@ -1500,7 +1500,19 @@
|
||||
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
|
||||
)
|
||||
|
||||
(define_insn "div<mode>3"
|
||||
;; Expander for floating-point division in the VDQF vector modes.
;; The preparation code first offers the operation to
;; aarch64_emit_approx_div; when that returns true the expansion is
;; complete (DONE).  Otherwise operand 1, which is accepted here as a
;; general_operand, is forced into a register so that the (set (div ...))
;; template can be matched by an insn whose inputs are register operands.
(define_expand "div<mode>3"
|
||||
[(set (match_operand:VDQF 0 "register_operand")
|
||||
(div:VDQF (match_operand:VDQF 1 "general_operand")
|
||||
(match_operand:VDQF 2 "register_operand")))]
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
|
||||
DONE;
|
||||
|
||||
|
||||
operands[1] = force_reg (<MODE>mode, operands[1]);
|
||||
})
|
||||
|
||||
(define_insn "*div<mode>3"
|
||||
[(set (match_operand:VDQF 0 "register_operand" "=w")
|
||||
(div:VDQF (match_operand:VDQF 1 "register_operand" "w")
|
||||
(match_operand:VDQF 2 "register_operand" "w")))]
|
||||
|
@ -396,6 +396,7 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
|
||||
/* Generic approximation modes. */
|
||||
static const cpu_approx_modes generic_approx_modes =
{
  /* division  */	AARCH64_APPROX_NONE,
  /* sqrt  */		AARCH64_APPROX_NONE,
  /* recip_sqrt  */	AARCH64_APPROX_NONE
};
|
||||
@ -403,6 +404,7 @@ static const cpu_approx_modes generic_approx_modes =
|
||||
/* Approximation modes for Exynos M1. */
|
||||
static const cpu_approx_modes exynosm1_approx_modes =
{
  /* division  */	AARCH64_APPROX_NONE,
  /* sqrt  */		AARCH64_APPROX_ALL,
  /* recip_sqrt  */	AARCH64_APPROX_ALL
};
|
||||
@ -410,6 +412,7 @@ static const cpu_approx_modes exynosm1_approx_modes =
|
||||
/* Approximation modes for X-Gene 1. */
|
||||
static const cpu_approx_modes xgene1_approx_modes =
{
  /* division  */	AARCH64_APPROX_NONE,
  /* sqrt  */		AARCH64_APPROX_NONE,
  /* recip_sqrt  */	AARCH64_APPROX_ALL
};
|
||||
@ -7488,6 +7491,95 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
|
||||
return true;
|
||||
}
|
||||
|
||||
typedef rtx (*recpe_type) (rtx, rtx);
|
||||
|
||||
/* Select reciprocal initial estimate insn depending on machine mode. */
|
||||
|
||||
static recpe_type
|
||||
get_recpe_type (machine_mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
{
|
||||
case SFmode: return (gen_aarch64_frecpesf);
|
||||
case V2SFmode: return (gen_aarch64_frecpev2sf);
|
||||
case V4SFmode: return (gen_aarch64_frecpev4sf);
|
||||
case DFmode: return (gen_aarch64_frecpedf);
|
||||
case V2DFmode: return (gen_aarch64_frecpev2df);
|
||||
default: gcc_unreachable ();
|
||||
}
|
||||
}
|
||||
|
||||
typedef rtx (*recps_type) (rtx, rtx, rtx);
|
||||
|
||||
/* Select reciprocal series step insn depending on machine mode. */
|
||||
|
||||
static recps_type
|
||||
get_recps_type (machine_mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
{
|
||||
case SFmode: return (gen_aarch64_frecpssf);
|
||||
case V2SFmode: return (gen_aarch64_frecpsv2sf);
|
||||
case V4SFmode: return (gen_aarch64_frecpsv4sf);
|
||||
case DFmode: return (gen_aarch64_frecpsdf);
|
||||
case V2DFmode: return (gen_aarch64_frecpsv2df);
|
||||
default: gcc_unreachable ();
|
||||
}
|
||||
}
|
||||
|
||||
/* Emit the instruction sequence to compute the approximation for the division
|
||||
of NUM by DEN in QUO and return whether the sequence was emitted or not. */
|
||||
|
||||
bool
|
||||
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
|
||||
{
|
||||
machine_mode mode = GET_MODE (quo);
|
||||
bool use_approx_division_p = (flag_mlow_precision_div
|
||||
|| (aarch64_tune_params.approx_modes->division
|
||||
& AARCH64_APPROX_MODE (mode)));
|
||||
|
||||
if (!flag_finite_math_only
|
||||
|| flag_trapping_math
|
||||
|| !flag_unsafe_math_optimizations
|
||||
|| optimize_function_for_size_p (cfun)
|
||||
|| !use_approx_division_p)
|
||||
return false;
|
||||
|
||||
/* Estimate the approximate reciprocal. */
|
||||
rtx xrcp = gen_reg_rtx (mode);
|
||||
emit_insn ((*get_recpe_type (mode)) (xrcp, den));
|
||||
|
||||
/* Iterate over the series twice for SF and thrice for DF. */
|
||||
int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
|
||||
|
||||
/* Optionally iterate over the series once less for faster performance,
|
||||
while sacrificing the accuracy. */
|
||||
if (flag_mlow_precision_div)
|
||||
iterations--;
|
||||
|
||||
/* Iterate over the series to calculate the approximate reciprocal. */
|
||||
rtx xtmp = gen_reg_rtx (mode);
|
||||
while (iterations--)
|
||||
{
|
||||
emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
|
||||
|
||||
if (iterations > 0)
|
||||
emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
|
||||
}
|
||||
|
||||
if (num != CONST1_RTX (mode))
|
||||
{
|
||||
/* As the approximate reciprocal of DEN is already calculated, only
|
||||
calculate the approximate division when NUM is not 1.0. */
|
||||
rtx xnum = force_reg (mode, num);
|
||||
emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
|
||||
}
|
||||
|
||||
/* Finalize the approximation. */
|
||||
emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Return the number of instructions that can be issued per cycle. */
|
||||
static int
|
||||
aarch64_sched_issue_rate (void)
|
||||
|
@ -4715,11 +4715,22 @@
|
||||
[(set_attr "type" "fmul<s>")]
|
||||
)
|
||||
|
||||
(define_insn "div<mode>3"
|
||||
;; Expander for scalar floating-point division in the GPF modes.
;; The fdiv insn this expands to only needs TARGET_FLOAT, so the expander
;; is gated on TARGET_FLOAT as well; gating it on TARGET_SIMD would remove
;; the standard name div<mode>3 whenever FP is enabled without SIMD.
;; The approximation is still attempted only with TARGET_SIMD, which is
;; the only configuration in which it was reachable before.
(define_expand "div<mode>3"
 [(set (match_operand:GPF 0 "register_operand")
       (div:GPF (match_operand:GPF 1 "general_operand")
		(match_operand:GPF 2 "register_operand")))]
 "TARGET_FLOAT"
{
  /* If the approximate sequence was emitted there is nothing left to do.  */
  if (TARGET_SIMD
      && aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
    DONE;

  /* Otherwise expand a regular division, whose insn takes register
     operands only.  */
  operands[1] = force_reg (<MODE>mode, operands[1]);
})
|
||||
|
||||
(define_insn "*div<mode>3"
|
||||
[(set (match_operand:GPF 0 "register_operand" "=w")
|
||||
(div:GPF
|
||||
(match_operand:GPF 1 "register_operand" "w")
|
||||
(match_operand:GPF 2 "register_operand" "w")))]
|
||||
(div:GPF (match_operand:GPF 1 "register_operand" "w")
|
||||
(match_operand:GPF 2 "register_operand" "w")))]
|
||||
"TARGET_FLOAT"
|
||||
"fdiv\\t%<s>0, %<s>1, %<s>2"
|
||||
[(set_attr "type" "fdiv<s>")]
|
||||
|
@ -161,3 +161,9 @@ Enable the square root approximation. Enabling this reduces
|
||||
precision of square root results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
If enabled, it implies -mlow-precision-recip-sqrt.
|
||||
|
||||
mlow-precision-div
|
||||
Common Var(flag_mlow_precision_div) Optimization
|
||||
Enable the division approximation. Enabling this reduces
|
||||
precision of division results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
|
@ -577,6 +577,7 @@ Objective-C and Objective-C++ Dialects}.
|
||||
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
|
||||
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
|
||||
-mlow-precision-sqrt -mno-low-precision-sqrt@gol
|
||||
-mlow-precision-div -mno-low-precision-div @gol
|
||||
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
|
||||
|
||||
@emph{Adapteva Epiphany Options}
|
||||
@ -13040,6 +13041,16 @@ precision of square root results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
If enabled, it implies @option{-mlow-precision-recip-sqrt}.
|
||||
|
||||
@item -mlow-precision-div
|
||||
@item -mno-low-precision-div
|
||||
@opindex mlow-precision-div
|
||||
@opindex mno-low-precision-div
|
||||
Enable or disable the division approximation.
|
||||
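This option only has an effect if @option{-ffast-math} is used as well, or
if @option{-funsafe-math-optimizations} is combined with
@option{-ffinite-math-only} and @option{-fno-trapping-math}, and the
function is not being optimized for size.  Enabling this reduces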
|
||||
precision of division results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
|
||||
@item -march=@var{name}
|
||||
@opindex march
|
||||
Specify the name of the target architecture and, optionally, one or
|
||||
|
Loading…
Reference in New Issue
Block a user