[aarch64] Fix target/95969: __builtin_aarch64_im_lane_boundsi interferes with gimple

This patch adds simple folding of __builtin_aarch64_im_lane_boundsi for the
cases where no error is going to be emitted, i.e. where all three arguments
are integer constants and the lane index is in range.  It fixes the problem
by removing the call from the IR so it no longer gets in the way of later
gimple optimizations.
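
To make the condition concrete, here is a standalone restatement of the
check (illustrative only, not part of the patch; the helper name
lane_check_removable_p is made up).  Judging by the new tests, the builtin's
three arguments are the total vector size in bytes, the element size in
bytes and the lane index, and the call can be dropped exactly when all
three are integer constants and the lane is in range:

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the condition checked by aarch64_fold_builtin_lane_check
   below, restated over plain integers; in the compiler the arguments
   are trees and must all be INTEGER_CSTs before this test is even
   attempted.  */
static bool
lane_check_removable_p (uint64_t totalsize, uint64_t elementsize,
                        uint64_t lane)
{
  /* A zero size would make the division meaningless, so keep the call.  */
  if (totalsize == 0 || elementsize == 0)
    return false;
  /* In range when the lane index is below the number of lanes.  */
  return lane < totalsize / elementsize;
}

For example lane_check_removable_p (16, 4, 0) is true, which is why the
calls in the new lane-bound-2.c test fold away before gimplification,
while a constant out-of-range lane keeps the call so the existing
diagnostic is still emitted at expansion time.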

OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
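
(For reference, the two new tests should be runnable on their own with
something along the lines of
make check-gcc RUNTESTFLAGS="aarch64.exp=lane-bound-1.c lane-bound-2.c"
from the gcc build directory; the exact invocation depends on the build
setup.)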

gcc/ChangeLog:

	PR target/95969
	* config/aarch64/aarch64-builtins.c (aarch64_fold_builtin_lane_check):
	New function.
	(aarch64_general_fold_builtin): Handle AARCH64_SIMD_BUILTIN_LANE_CHECK.
	(aarch64_general_gimple_fold_builtin): Likewise.

gcc/testsuite/ChangeLog:

	PR target/95969
	* gcc.target/aarch64/lane-bound-1.c: New test.
	* gcc.target/aarch64/lane-bound-2.c: New test.
commit 03312cbd54
parent 20f3c16820
Author: Andrew Pinski
Date:   2021-09-02 07:08:22 +00:00

3 changed files with 59 additions and 0 deletions

gcc/config/aarch64/aarch64-builtins.c

@@ -29,6 +29,7 @@
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
+#include "ssa.h"
 #include "memmodel.h"
 #include "tm_p.h"
 #include "expmed.h"
@@ -2333,6 +2334,27 @@ aarch64_general_builtin_rsqrt (unsigned int fn)
   return NULL_TREE;
 }
 
+/* Return true if the lane check can be removed as there is no
+   error going to be emitted.  */
+static bool
+aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
+{
+  if (TREE_CODE (arg0) != INTEGER_CST)
+    return false;
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    return false;
+  if (TREE_CODE (arg2) != INTEGER_CST)
+    return false;
+
+  auto totalsize = wi::to_widest (arg0);
+  auto elementsize = wi::to_widest (arg1);
+  if (totalsize == 0 || elementsize == 0)
+    return false;
+  auto lane = wi::to_widest (arg2);
+  auto high = wi::udiv_trunc (totalsize, elementsize);
+  return wi::ltu_p (lane, high);
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
@@ -2353,6 +2375,11 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
       VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
       VAR1 (UNOP, floatv2di, 2, ALL, v2df)
         return fold_build1 (FLOAT_EXPR, type, args[0]);
+    case AARCH64_SIMD_BUILTIN_LANE_CHECK:
+      gcc_assert (n_args == 3);
+      if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
+        return void_node;
+      break;
     default:
       break;
     }
@@ -2440,6 +2467,14 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
           }
         break;
       }
+    case AARCH64_SIMD_BUILTIN_LANE_CHECK:
+      if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
+        {
+          unlink_stmt_vdef (stmt);
+          release_defs (stmt);
+          new_stmt = gimple_build_nop ();
+        }
+      break;
     default:
       break;
     }

gcc/testsuite/gcc.target/aarch64/lane-bound-1.c

@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+#include <arm_neon.h>
+
+void
+f (float32x4_t **ptr)
+{
+  float32x4_t res = vsetq_lane_f32 (0.0f, **ptr, 0);
+  **ptr = res;
+}
+/* GCC should be able to remove the call to "__builtin_aarch64_im_lane_boundsi"
+   and optimize out the second load from *ptr.  */
+/* { dg-final { scan-tree-dump-times "__builtin_aarch64_im_lane_boundsi" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " = \\\*ptr_" 1 "optimized" } } */

gcc/testsuite/gcc.target/aarch64/lane-bound-2.c

@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-original" } */
+void
+f (void)
+{
+  __builtin_aarch64_im_lane_boundsi (16, 4, 0);
+  __builtin_aarch64_im_lane_boundsi (8, 8, 0);
+}
+/* GCC should be able to optimize these out before gimplification.  */
+/* { dg-final { scan-tree-dump-times "__builtin_aarch64_im_lane_boundsi" 0 "original" } } */