4a9f2306cb
VEC_PERM_EXPR requires the number of MASK elements must be the same with the number of elements in operands V0 and V1. In some cases, like with Power altivec built-in function vec_perm, VIEW_CONVERT_EXPR has to be used to guarantee this requirement, but it can prevent some simplifications which don't consider this well. For the cases that the permutated operands of vector permutation are from two same type CTOR and CTOR, or one CTOR and one VECTOR CST, this patch is to enhance forwprop to look through intermediate VIEW_CONVERT_EXPR and further simplify them if possible. Bootstrapped/regtested on powerpc64le-linux-gnu P9, powerpc64-linux-gnu P8, x86_64-redhat-linux and aarch64-linux-gnu. gcc/ChangeLog: PR tree-optimization/99398 * tree-ssa-forwprop.c (simplify_permutation): Optimize some cases where the fed operands are CTOR/CST and propagated through VIEW_CONVERT_EXPR. Call vec_perm_indices::new_shrunk_vector. * vec-perm-indices.c (vec_perm_indices::new_shrunk_vector): New function. * vec-perm-indices.h (vec_perm_indices::new_shrunk_vector): New declare. gcc/testsuite/ChangeLog: PR tree-optimization/99398 * gcc.target/powerpc/vec-perm-ctor-run.c: New test. * gcc.target/powerpc/vec-perm-ctor.c: New test. * gcc.target/powerpc/vec-perm-ctor.h: New test.
383 lines
13 KiB
C
383 lines
13 KiB
C
/* A representation of vector permutation indices.
|
|
Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
|
|
|
This file is part of GCC.
|
|
|
|
GCC is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free
|
|
Software Foundation; either version 3, or (at your option) any later
|
|
version.
|
|
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with GCC; see the file COPYING3. If not see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include "config.h"
|
|
#include "system.h"
|
|
#include "coretypes.h"
|
|
#include "vec-perm-indices.h"
|
|
#include "tree.h"
|
|
#include "fold-const.h"
|
|
#include "tree-vector-builder.h"
|
|
#include "backend.h"
|
|
#include "rtl.h"
|
|
#include "memmodel.h"
|
|
#include "emit-rtl.h"
|
|
#include "selftest.h"
|
|
#include "rtx-vector-builder.h"
|
|
|
|
/* Switch to a new permutation vector that selects between NINPUTS vector
|
|
inputs that have NELTS_PER_INPUT elements each. Take the elements of the
|
|
new permutation vector from ELEMENTS, clamping each one to be in range. */
|
|
|
|
void
|
|
vec_perm_indices::new_vector (const vec_perm_builder &elements,
|
|
unsigned int ninputs,
|
|
poly_uint64 nelts_per_input)
|
|
{
|
|
m_ninputs = ninputs;
|
|
m_nelts_per_input = nelts_per_input;
|
|
/* If the vector has a constant number of elements, expand the
|
|
encoding and clamp each element. E.g. { 0, 2, 4, ... } might
|
|
wrap halfway if there is only one vector input, and we want
|
|
the wrapped form to be the canonical one.
|
|
|
|
If the vector has a variable number of elements, just copy
|
|
the encoding. In that case the unwrapped form is canonical
|
|
and there is no way of representing the wrapped form. */
|
|
poly_uint64 full_nelts = elements.full_nelts ();
|
|
unsigned HOST_WIDE_INT copy_nelts;
|
|
if (full_nelts.is_constant (©_nelts))
|
|
m_encoding.new_vector (full_nelts, copy_nelts, 1);
|
|
else
|
|
{
|
|
copy_nelts = elements.encoded_nelts ();
|
|
m_encoding.new_vector (full_nelts, elements.npatterns (),
|
|
elements.nelts_per_pattern ());
|
|
}
|
|
unsigned int npatterns = m_encoding.npatterns ();
|
|
for (unsigned int i = 0; i < npatterns; ++i)
|
|
m_encoding.quick_push (clamp (elements.elt (i)));
|
|
/* Use the fact that:
|
|
|
|
(a + b) % c == ((a % c) + (b % c)) % c
|
|
|
|
to simplify the clamping of variable-length vectors. */
|
|
for (unsigned int i = npatterns; i < copy_nelts; ++i)
|
|
{
|
|
element_type step = clamp (elements.elt (i)
|
|
- elements.elt (i - npatterns));
|
|
m_encoding.quick_push (clamp (m_encoding[i - npatterns] + step));
|
|
}
|
|
m_encoding.finalize ();
|
|
}
|
|
|
|
/* Switch to a new permutation vector that selects the same input elements
|
|
as ORIG, but with each element split into FACTOR pieces. For example,
|
|
if ORIG is { 1, 2, 0, 3 } and FACTOR is 2, the new permutation is
|
|
{ 2, 3, 4, 5, 0, 1, 6, 7 }. */
|
|
|
|
void
|
|
vec_perm_indices::new_expanded_vector (const vec_perm_indices &orig,
|
|
unsigned int factor)
|
|
{
|
|
m_ninputs = orig.m_ninputs;
|
|
m_nelts_per_input = orig.m_nelts_per_input * factor;
|
|
m_encoding.new_vector (orig.m_encoding.full_nelts () * factor,
|
|
orig.m_encoding.npatterns () * factor,
|
|
orig.m_encoding.nelts_per_pattern ());
|
|
unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
|
|
for (unsigned int i = 0; i < encoded_nelts; ++i)
|
|
{
|
|
element_type base = orig.m_encoding[i] * factor;
|
|
for (unsigned int j = 0; j < factor; ++j)
|
|
m_encoding.quick_push (base + j);
|
|
}
|
|
m_encoding.finalize ();
|
|
}
|
|
|
|
/* Check whether we can switch to a new permutation vector that
|
|
selects the same input elements as ORIG, but with each element
|
|
built up from FACTOR pieces. Return true if yes, otherwise
|
|
return false. Every FACTOR permutation indexes should be
|
|
continuous separately and the first one of each batch should
|
|
be able to exactly modulo FACTOR. For example, if ORIG is
|
|
{ 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation
|
|
is { 1, 2, 0, 3 }. */
|
|
|
|
bool
|
|
vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig,
|
|
unsigned int factor)
|
|
{
|
|
gcc_assert (factor > 0);
|
|
|
|
if (maybe_lt (orig.m_nelts_per_input, factor))
|
|
return false;
|
|
|
|
poly_uint64 nelts;
|
|
/* Invalid if vector units number isn't multiple of factor. */
|
|
if (!multiple_p (orig.m_nelts_per_input, factor, &nelts))
|
|
return false;
|
|
|
|
/* Only handle the case that npatterns is multiple of factor.
|
|
FIXME: Try to see whether we can reshape it by factor npatterns. */
|
|
if (orig.m_encoding.npatterns () % factor != 0)
|
|
return false;
|
|
|
|
unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
|
|
auto_vec<element_type, 32> encoding (encoded_nelts);
|
|
/* Separate all encoded elements into batches by size factor,
|
|
then ensure the first element of each batch is multiple of
|
|
factor and all elements in each batch is consecutive from
|
|
the first one. */
|
|
for (unsigned int i = 0; i < encoded_nelts; i += factor)
|
|
{
|
|
element_type first = orig.m_encoding[i];
|
|
element_type new_index;
|
|
if (!multiple_p (first, factor, &new_index))
|
|
return false;
|
|
for (unsigned int j = 1; j < factor; ++j)
|
|
if (maybe_ne (first + j, orig.m_encoding[i + j]))
|
|
return false;
|
|
encoding.quick_push (new_index);
|
|
}
|
|
|
|
m_ninputs = orig.m_ninputs;
|
|
m_nelts_per_input = nelts;
|
|
poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), factor);
|
|
unsigned int npatterns = orig.m_encoding.npatterns () / factor;
|
|
|
|
m_encoding.new_vector (full_nelts, npatterns,
|
|
orig.m_encoding.nelts_per_pattern ());
|
|
m_encoding.splice (encoding);
|
|
m_encoding.finalize ();
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Rotate the inputs of the permutation right by DELTA inputs. This changes
|
|
the values of the permutation vector but it doesn't change the way that
|
|
the elements are encoded. */
|
|
|
|
void
|
|
vec_perm_indices::rotate_inputs (int delta)
|
|
{
|
|
element_type element_delta = delta * m_nelts_per_input;
|
|
for (unsigned int i = 0; i < m_encoding.length (); ++i)
|
|
m_encoding[i] = clamp (m_encoding[i] + element_delta);
|
|
}
|
|
|
|
/* Return true if index OUT_BASE + I * OUT_STEP selects input
|
|
element IN_BASE + I * IN_STEP. For example, the call to test
|
|
whether a permute reverses a vector of N elements would be:
|
|
|
|
series_p (0, 1, N - 1, -1)
|
|
|
|
which would return true for { N - 1, N - 2, N - 3, ... }.
|
|
The calls to test for an interleaving of elements starting
|
|
at N1 and N2 would be:
|
|
|
|
series_p (0, 2, N1, 1) && series_p (1, 2, N2, 1).
|
|
|
|
which would return true for { N1, N2, N1 + 1, N2 + 1, ... }. */
|
|
|
|
bool
|
|
vec_perm_indices::series_p (unsigned int out_base, unsigned int out_step,
|
|
element_type in_base, element_type in_step) const
|
|
{
|
|
/* Check the base value. */
|
|
if (maybe_ne (clamp (m_encoding.elt (out_base)), clamp (in_base)))
|
|
return false;
|
|
|
|
element_type full_nelts = m_encoding.full_nelts ();
|
|
unsigned int npatterns = m_encoding.npatterns ();
|
|
|
|
/* Calculate which multiple of OUT_STEP elements we need to get
|
|
back to the same pattern. */
|
|
unsigned int cycle_length = least_common_multiple (out_step, npatterns);
|
|
|
|
/* Check the steps. */
|
|
in_step = clamp (in_step);
|
|
out_base += out_step;
|
|
unsigned int limit = 0;
|
|
for (;;)
|
|
{
|
|
/* Succeed if we've checked all the elements in the vector. */
|
|
if (known_ge (out_base, full_nelts))
|
|
return true;
|
|
|
|
if (out_base >= npatterns)
|
|
{
|
|
/* We've got to the end of the "foreground" values. Check
|
|
2 elements from each pattern in the "background" values. */
|
|
if (limit == 0)
|
|
limit = out_base + cycle_length * 2;
|
|
else if (out_base >= limit)
|
|
return true;
|
|
}
|
|
|
|
element_type v0 = m_encoding.elt (out_base - out_step);
|
|
element_type v1 = m_encoding.elt (out_base);
|
|
if (maybe_ne (clamp (v1 - v0), in_step))
|
|
return false;
|
|
|
|
out_base += out_step;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* Return true if all elements of the permutation vector are in the range
|
|
[START, START + SIZE). */
|
|
|
|
bool
|
|
vec_perm_indices::all_in_range_p (element_type start, element_type size) const
|
|
{
|
|
/* Check the first two elements of each pattern. */
|
|
unsigned int npatterns = m_encoding.npatterns ();
|
|
unsigned int nelts_per_pattern = m_encoding.nelts_per_pattern ();
|
|
unsigned int base_nelts = npatterns * MIN (nelts_per_pattern, 2);
|
|
for (unsigned int i = 0; i < base_nelts; ++i)
|
|
if (!known_in_range_p (m_encoding[i], start, size))
|
|
return false;
|
|
|
|
/* For stepped encodings, check the full range of the series. */
|
|
if (nelts_per_pattern == 3)
|
|
{
|
|
element_type limit = input_nelts ();
|
|
|
|
/* The number of elements in each pattern beyond the first two
|
|
that we checked above. */
|
|
poly_int64 step_nelts = exact_div (m_encoding.full_nelts (),
|
|
npatterns) - 2;
|
|
for (unsigned int i = 0; i < npatterns; ++i)
|
|
{
|
|
/* BASE1 has been checked but BASE2 hasn't. */
|
|
element_type base1 = m_encoding[i + npatterns];
|
|
element_type base2 = m_encoding[i + base_nelts];
|
|
|
|
/* The step to add to get from BASE1 to each subsequent value. */
|
|
element_type step = clamp (base2 - base1);
|
|
|
|
/* STEP has no inherent sign, so a value near LIMIT can
|
|
act as a negative step. The series is in range if it
|
|
is in range according to one of the two interpretations.
|
|
|
|
Since we're dealing with clamped values, ELEMENT_TYPE is
|
|
wide enough for overflow not to be a problem. */
|
|
element_type headroom_down = base1 - start;
|
|
element_type headroom_up = size - headroom_down - 1;
|
|
HOST_WIDE_INT diff;
|
|
if ((!step.is_constant (&diff)
|
|
|| maybe_lt (headroom_up, diff * step_nelts))
|
|
&& (!(limit - step).is_constant (&diff)
|
|
|| maybe_lt (headroom_down, diff * step_nelts)))
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* Try to read the contents of VECTOR_CST CST as a constant permutation
|
|
vector. Return true and add the elements to BUILDER on success,
|
|
otherwise return false without modifying BUILDER. */
|
|
|
|
bool
|
|
tree_to_vec_perm_builder (vec_perm_builder *builder, tree cst)
|
|
{
|
|
unsigned int encoded_nelts = vector_cst_encoded_nelts (cst);
|
|
for (unsigned int i = 0; i < encoded_nelts; ++i)
|
|
if (!tree_fits_poly_int64_p (VECTOR_CST_ENCODED_ELT (cst, i)))
|
|
return false;
|
|
|
|
builder->new_vector (TYPE_VECTOR_SUBPARTS (TREE_TYPE (cst)),
|
|
VECTOR_CST_NPATTERNS (cst),
|
|
VECTOR_CST_NELTS_PER_PATTERN (cst));
|
|
for (unsigned int i = 0; i < encoded_nelts; ++i)
|
|
builder->quick_push (tree_to_poly_int64 (VECTOR_CST_ENCODED_ELT (cst, i)));
|
|
return true;
|
|
}
|
|
|
|
/* Return a VECTOR_CST of type TYPE for the permutation vector in INDICES. */
|
|
|
|
tree
|
|
vec_perm_indices_to_tree (tree type, const vec_perm_indices &indices)
|
|
{
|
|
gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), indices.length ()));
|
|
tree_vector_builder sel (type, indices.encoding ().npatterns (),
|
|
indices.encoding ().nelts_per_pattern ());
|
|
unsigned int encoded_nelts = sel.encoded_nelts ();
|
|
for (unsigned int i = 0; i < encoded_nelts; i++)
|
|
sel.quick_push (build_int_cst (TREE_TYPE (type), indices[i]));
|
|
return sel.build ();
|
|
}
|
|
|
|
/* Return a CONST_VECTOR of mode MODE that contains the elements of
|
|
INDICES. */
|
|
|
|
rtx
|
|
vec_perm_indices_to_rtx (machine_mode mode, const vec_perm_indices &indices)
|
|
{
|
|
gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|
|
&& known_eq (GET_MODE_NUNITS (mode), indices.length ()));
|
|
rtx_vector_builder sel (mode, indices.encoding ().npatterns (),
|
|
indices.encoding ().nelts_per_pattern ());
|
|
unsigned int encoded_nelts = sel.encoded_nelts ();
|
|
for (unsigned int i = 0; i < encoded_nelts; i++)
|
|
sel.quick_push (gen_int_mode (indices[i], GET_MODE_INNER (mode)));
|
|
return sel.build ();
|
|
}
|
|
|
|
#if CHECKING_P
|
|
|
|
namespace selftest {
|
|
|
|
/* Test a 12-element vector. */
|
|
|
|
static void
|
|
test_vec_perm_12 (void)
|
|
{
|
|
vec_perm_builder builder (12, 12, 1);
|
|
for (unsigned int i = 0; i < 4; ++i)
|
|
{
|
|
builder.quick_push (i * 5);
|
|
builder.quick_push (3 + i);
|
|
builder.quick_push (2 + 3 * i);
|
|
}
|
|
vec_perm_indices indices (builder, 1, 12);
|
|
ASSERT_TRUE (indices.series_p (0, 3, 0, 5));
|
|
ASSERT_FALSE (indices.series_p (0, 3, 3, 5));
|
|
ASSERT_FALSE (indices.series_p (0, 3, 0, 8));
|
|
ASSERT_TRUE (indices.series_p (1, 3, 3, 1));
|
|
ASSERT_TRUE (indices.series_p (2, 3, 2, 3));
|
|
|
|
ASSERT_TRUE (indices.series_p (0, 4, 0, 4));
|
|
ASSERT_FALSE (indices.series_p (1, 4, 3, 4));
|
|
|
|
ASSERT_TRUE (indices.series_p (0, 6, 0, 10));
|
|
ASSERT_FALSE (indices.series_p (0, 6, 0, 100));
|
|
|
|
ASSERT_FALSE (indices.series_p (1, 10, 3, 7));
|
|
ASSERT_TRUE (indices.series_p (1, 10, 3, 8));
|
|
|
|
ASSERT_TRUE (indices.series_p (0, 12, 0, 10));
|
|
ASSERT_TRUE (indices.series_p (0, 12, 0, 11));
|
|
ASSERT_TRUE (indices.series_p (0, 12, 0, 100));
|
|
}
|
|
|
|
/* Run selftests for this file. */
|
|
|
|
void
|
|
vec_perm_indices_c_tests ()
|
|
{
|
|
test_vec_perm_12 ();
|
|
}
|
|
|
|
} // namespace selftest
|
|
|
|
#endif
|