Rename the tuning option and related functions that enable the Newton series for the reciprocal square root, so that the names reflect its approximate nature.

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_emit_swrsqrt): Rename
	function to "aarch64_emit_approx_rsqrt".
	* config/aarch64/aarch64-tuning-flags.def: Rename tuning flag to
	AARCH64_EXTRA_TUNE_APPROX_RSQRT.
	* config/aarch64/aarch64.c (exynosm1_tunings): Use new flag name.
	(xgene1_tunings): Likewise.
	(use_rsqrt_p): Likewise.
	(aarch64_emit_swrsqrt): Use new function name.
	* config/aarch64/aarch64-simd.md (aarch64_rsqrts_*): Likewise.
	* config/aarch64/aarch64.opt (mlow-precision-recip-sqrt): Reword the
	text explaining this option.
	* doc/invoke.texi (-mlow-precision-recip-sqrt): Likewise.

From-SVN: r233772
Author: Evandro Menezes
Date:   2016-02-26 23:41:53 +00:00
Commit: 0c30e0f316 (parent 20ba5f3318)
7 changed files with 41 additions and 25 deletions

gcc/ChangeLog

@@ -1,3 +1,23 @@
2016-02-26 Evandro Menezes <e.menezes@samsung.com>
Rename the AArch64 tuning option and related functions that enable the
Newton series for the reciprocal square root, so that the names reflect
its approximate nature.
gcc/
* config/aarch64/aarch64-protos.h (aarch64_emit_swrsqrt): Rename
function to "aarch64_emit_approx_rsqrt".
* config/aarch64/aarch64-tuning-flags.def: Rename tuning flag to
AARCH64_EXTRA_TUNE_APPROX_RSQRT.
* config/aarch64/aarch64.c (exynosm1_tunings): Use new flag name.
(xgene1_tunings): Likewise.
(use_rsqrt_p): Likewise.
(aarch64_emit_swrsqrt): Use new function name.
* config/aarch64/aarch64-simd.md (aarch64_rsqrts_*): Likewise.
* config/aarch64/aarch64.opt (mlow-precision-recip-sqrt): Reword the
text explaining this option.
* doc/invoke.texi (-mlow-precision-recip-sqrt): Likewise.
2016-02-26 Jakub Jelinek <jakub@redhat.com>
PR target/69969

gcc/config/aarch64/aarch64-protos.h

@@ -360,8 +360,7 @@ void aarch64_emit_call_insn (rtx);
void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_emit_swrsqrt (rtx, rtx);
void aarch64_emit_approx_rsqrt (rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
@@ -413,9 +412,7 @@ rtx aarch64_expand_builtin (tree exp,
machine_mode mode ATTRIBUTE_UNUSED,
int ignore ATTRIBUTE_UNUSED);
tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
tree aarch64_builtin_rsqrt (unsigned int);
tree aarch64_builtin_vectorized_function (unsigned int, tree, tree);
extern void aarch64_split_combinev16qi (rtx operands[3]);

gcc/config/aarch64/aarch64-simd.md

@@ -405,7 +405,7 @@
UNSPEC_RSQRT))]
"TARGET_SIMD"
{
aarch64_emit_swrsqrt (operands[0], operands[1]);
aarch64_emit_approx_rsqrt (operands[0], operands[1]);
DONE;
})

gcc/config/aarch64/aarch64-tuning-flags.def

@@ -29,5 +29,5 @@
AARCH64_TUNE_ to give an enum name. */
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)

gcc/config/aarch64/aarch64.c

@@ -538,7 +538,7 @@ static const struct tune_params exynosm1_tunings =
48, /* max_case_values. */
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
(AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
};
static const struct tune_params thunderx_tunings =
@@ -586,7 +586,7 @@ static const struct tune_params xgene1_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
(AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
};
/* Support for fine-grained override of the tuning structures. */
@@ -7460,8 +7460,8 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
return aarch64_tune_params.memmov_cost;
}
/* Return true if it is safe and beneficial to use the rsqrt optabs to
optimize 1.0/sqrt. */
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
to optimize 1.0/sqrt. */
static bool
use_rsqrt_p (void)
@@ -7469,12 +7469,12 @@ use_rsqrt_p (void)
return (!flag_trapping_math
&& flag_unsafe_math_optimizations
&& ((aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_RECIP_SQRT)
& AARCH64_EXTRA_TUNE_APPROX_RSQRT)
|| flag_mrecip_low_precision_sqrt));
}
/* Function to decide when to use
reciprocal square root builtins. */
/* Function to decide when to use the approximate reciprocal square root
builtin. */
static tree
aarch64_builtin_reciprocal (tree fndecl)
@@ -7522,12 +7522,12 @@ get_rsqrts_type (machine_mode mode)
}
}
/* Emit instruction sequence to compute
reciprocal square root. Use two Newton-Raphson steps
for single precision and three for double precision. */
/* Emit instruction sequence to compute the reciprocal square root using the
Newton-Raphson series. Iterate over the series twice for SF
and thrice for DF. */
void
aarch64_emit_swrsqrt (rtx dst, rtx src)
aarch64_emit_approx_rsqrt (rtx dst, rtx src)
{
machine_mode mode = GET_MODE (src);
gcc_assert (
@@ -7544,6 +7544,7 @@ aarch64_emit_swrsqrt (rtx dst, rtx src)
int iterations = double_mode ? 3 : 2;
/* Optionally iterate over the series one less time than otherwise. */
if (flag_mrecip_low_precision_sqrt)
iterations--;
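
For reference, the emitted sequence implements the standard Newton-Raphson refinement of a hardware reciprocal square root estimate (an estimate instruction refined by step instructions, FRSQRTE/FRSQRTS on AArch64). The scalar C sketch below is only a model of that arithmetic, not the RTL that aarch64_emit_approx_rsqrt produces; the crude initial guess stands in for the hardware estimate.

#include <math.h>

/* Illustrative scalar model of the approximate reciprocal square root.
   Each Newton-Raphson step computes y' = y * (3 - x * y * y) / 2 and
   roughly doubles the number of correct bits in the estimate.  */
static float
rsqrt_newton_model (float x, int iterations)
{
  /* Crude stand-in for the hardware estimate: round an accurate value
     to a coarse grid so the refinement below has work to do.  */
  float y = 1.0f / sqrtf (x);
  y = truncf (y * 64.0f) / 64.0f;

  /* Two steps are used for single precision, three for double;
     -mlow-precision-recip-sqrt drops one step.  */
  for (int i = 0; i < iterations; i++)
    y = y * (3.0f - x * y * y) * 0.5f;

  return y;
}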

gcc/config/aarch64/aarch64.opt

@@ -151,5 +151,5 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
When calculating a sqrt approximation, run fewer steps.
This reduces precision, but can result in faster computation.
When calculating the reciprocal square root approximation,
uses one less step than otherwise, thus reducing latency and precision.

gcc/doc/invoke.texi

@@ -12884,12 +12884,10 @@ corresponding flag to the linker.
@item -mno-low-precision-recip-sqrt
@opindex -mlow-precision-recip-sqrt
@opindex -mno-low-precision-recip-sqrt
The square root estimate uses two steps instead of three for double-precision,
and one step instead of two for single-precision.
Thus reducing latency and precision.
This is only relevant if @option{-ffast-math} activates
reciprocal square root estimate instructions.
Which in turn depends on the target processor.
When calculating the reciprocal square root approximation,
uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the reciprocal square root
approximation, which in turn depends on the target processor.
@item -march=@var{name}
@opindex march
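
As a hypothetical illustration of where -mlow-precision-recip-sqrt can matter, the kernel below contains the 1.0f/sqrtf pattern that the approximation targets. Built with, say, -O2 -ffast-math -mlow-precision-recip-sqrt on a processor whose tuning enables the approximate rsqrt, the division and square root may be replaced by the estimate-plus-steps sequence, with one fewer refinement step because of the option.

#include <math.h>

/* Hypothetical example kernel: normalize an array of 3-D vectors.
   The 1.0f / sqrtf (...) below is a candidate for the approximate
   reciprocal square root sequence under -ffast-math on suitable cores.  */
void
normalize_vectors (float *x, float *y, float *z, int n)
{
  for (int i = 0; i < n; i++)
    {
      float inv = 1.0f / sqrtf (x[i] * x[i] + y[i] * y[i] + z[i] * z[i]);
      x[i] *= inv;
      y[i] *= inv;
      z[i] *= inv;
    }
}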