2022-01-03 10:42:10 +01:00
|
|
|
/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
This file is part of GCC.
|
|
|
|
|
|
|
|
GCC is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 3, or (at your option)
|
|
|
|
any later version.
|
|
|
|
|
|
|
|
GCC is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with GCC; see the file COPYING3. If not see
|
|
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
|
|
|
|
#define IN_TARGET_CODE 1
|
|
|
|
|
|
|
|
#include "config.h"
|
|
|
|
#include "system.h"
|
|
|
|
#include "coretypes.h"
|
|
|
|
#include "backend.h"
|
|
|
|
#include "rtl.h"
|
|
|
|
#include "tree.h"
|
|
|
|
#include "memmodel.h"
|
|
|
|
#include "gimple.h"
|
|
|
|
#include "cfghooks.h"
|
|
|
|
#include "cfgloop.h"
|
|
|
|
#include "df.h"
|
|
|
|
#include "tm_p.h"
|
|
|
|
#include "stringpool.h"
|
|
|
|
#include "expmed.h"
|
|
|
|
#include "optabs.h"
|
|
|
|
#include "regs.h"
|
|
|
|
#include "emit-rtl.h"
|
|
|
|
#include "recog.h"
|
|
|
|
#include "cgraph.h"
|
|
|
|
#include "diagnostic.h"
|
|
|
|
#include "cfgbuild.h"
|
|
|
|
#include "alias.h"
|
|
|
|
#include "fold-const.h"
|
|
|
|
#include "attribs.h"
|
|
|
|
#include "calls.h"
|
|
|
|
#include "stor-layout.h"
|
|
|
|
#include "varasm.h"
|
|
|
|
#include "output.h"
|
|
|
|
#include "insn-attr.h"
|
|
|
|
#include "flags.h"
|
|
|
|
#include "except.h"
|
|
|
|
#include "explow.h"
|
|
|
|
#include "expr.h"
|
|
|
|
#include "cfgrtl.h"
|
|
|
|
#include "common/common-target.h"
|
|
|
|
#include "langhooks.h"
|
|
|
|
#include "reload.h"
|
|
|
|
#include "gimplify.h"
|
|
|
|
#include "dwarf2.h"
|
|
|
|
#include "tm-constrs.h"
|
|
|
|
#include "cselib.h"
|
|
|
|
#include "sched-int.h"
|
|
|
|
#include "opts.h"
|
|
|
|
#include "tree-pass.h"
|
|
|
|
#include "context.h"
|
|
|
|
#include "pass_manager.h"
|
|
|
|
#include "target-globals.h"
|
|
|
|
#include "gimple-iterator.h"
|
|
|
|
#include "tree-vectorizer.h"
|
|
|
|
#include "shrink-wrap.h"
|
|
|
|
#include "builtins.h"
|
|
|
|
#include "rtl-iter.h"
|
|
|
|
#include "tree-iterator.h"
|
|
|
|
#include "dbgcnt.h"
|
|
|
|
#include "case-cfn-macros.h"
|
|
|
|
#include "dojump.h"
|
|
|
|
#include "fold-const-call.h"
|
|
|
|
#include "tree-vrp.h"
|
|
|
|
#include "tree-ssanames.h"
|
|
|
|
#include "selftest.h"
|
|
|
|
#include "selftest-rtl.h"
|
|
|
|
#include "print-rtl.h"
|
|
|
|
#include "intl.h"
|
|
|
|
#include "ifcvt.h"
|
|
|
|
#include "symbol-summary.h"
|
|
|
|
#include "ipa-prop.h"
|
|
|
|
#include "ipa-fnsummary.h"
|
|
|
|
#include "wide-int-bitmask.h"
|
|
|
|
#include "tree-vector-builder.h"
|
|
|
|
#include "debug.h"
|
|
|
|
#include "dwarf2out.h"
|
|
|
|
#include "i386-builtins.h"
|
|
|
|
#include "i386-features.h"
|
|
|
|
|
|
|
|
/* Base name strings for the ms2sysv out-of-line save/restore stubs,
   indexed by enum xlogue_stub.  "sav"/"res" distinguish save from
   restore stubs; the "f" suffix appears to denote the hard-frame-pointer
   variants and "x" the variants that also return -- NOTE(review): confirm
   against the libgcc stub sources.  get_stub_name composes the final
   symbol as "__<sse|avx>_<base>_<nregs>".  */
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};
|
|
|
|
|
|
|
|
/* The order in which the ms2sysv stubs store registers; the xlogue_layout
   constructor walks this table to assign each register its save-area
   offset.  */
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0		1		2		3
    Offset:					realigned or	aligned + 8
    Register	   aligned	aligned + 8	aligned w/HFP	w/HFP */
    XMM15_REG,	/* 0x10		0x18		0x10		0x18 */
    XMM14_REG,	/* 0x20		0x28		0x20		0x28 */
    XMM13_REG,	/* 0x30		0x38		0x30		0x38 */
    XMM12_REG,	/* 0x40		0x48		0x40		0x48 */
    XMM11_REG,	/* 0x50		0x58		0x50		0x58 */
    XMM10_REG,	/* 0x60		0x68		0x60		0x68 */
    XMM9_REG,	/* 0x70		0x78		0x70		0x78 */
    XMM8_REG,	/* 0x80		0x88		0x80		0x88 */
    XMM7_REG,	/* 0x90		0x98		0x90		0x98 */
    XMM6_REG,	/* 0xa0		0xa8		0xa0		0xa8 */
    SI_REG,	/* 0xa8		0xb0		0xa8		0xb0 */
    DI_REG,	/* 0xb0		0xb8		0xb0		0xb8 */
    BX_REG,	/* 0xb8		0xc0		0xb8		0xc0 */
    BP_REG,	/* 0xc0		0xc8		N/A		N/A  */
    R12_REG,	/* 0xc8		0xd0		0xc0		0xc8 */
    R13_REG,	/* 0xd0		0xd8		0xc8		0xd0 */
    R14_REG,	/* 0xd8		0xe0		0xd0		0xd8 */
    R15_REG,	/* 0xe0		0xe8		0xd8		0xe0 */
};
|
|
|
|
|
|
|
|
/* Instantiate static const values.  These are declared (with initializers)
   inside class xlogue_layout; the out-of-class definitions below provide
   storage for contexts where the constants are odr-used (required under
   pre-C++17 rules).  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
|
|
|
|
|
|
|
|
/* Initialize xlogue_layout::s_stub_names to zero.  The first index selects
   SSE (0) vs. AVX (1) stub names; being a zero-initialized static, an empty
   string marks an entry that get_stub_name has not yet built (it fills
   entries lazily on first use).  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				 [STUB_NAME_MAX_LEN];
|
|
|
|
|
|
|
|
/* Instantiates all xlogue_layout instances.  One layout exists for each
   combination of incoming stack pad (0 or 8 bytes) and hard-frame-pointer
   use; get_instance maps enum xlogue_stub_sets values onto these slots, so
   the order here must match that enum's order.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),	/* aligned */
  xlogue_layout (8, false),	/* aligned + 8 */
  xlogue_layout (0, true),	/* hard frame pointer, aligned/realigned */
  xlogue_layout (8, true)	/* hard frame pointer, aligned + 8 */
};
|
|
|
|
|
|
|
|
/* Return an appropriate const instance of xlogue_layout based upon values
|
|
|
|
in cfun->machine and crtl. */
|
2019-07-09 20:32:49 +02:00
|
|
|
const class xlogue_layout &
|
2019-05-06 09:18:26 +02:00
|
|
|
xlogue_layout::get_instance ()
|
|
|
|
{
|
|
|
|
enum xlogue_stub_sets stub_set;
|
|
|
|
bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
|
|
|
|
|
|
|
|
if (stack_realign_fp)
|
|
|
|
stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
|
|
|
|
else if (frame_pointer_needed)
|
|
|
|
stub_set = aligned_plus_8
|
|
|
|
? XLOGUE_SET_HFP_ALIGNED_PLUS_8
|
|
|
|
: XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
|
|
|
|
else
|
|
|
|
stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
|
|
|
|
|
|
|
|
return s_instances[stub_set];
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Determine how many clobbered registers can be saved by the stub.
|
|
|
|
Returns the count of registers the stub will save and restore. */
|
|
|
|
unsigned
|
|
|
|
xlogue_layout::count_stub_managed_regs ()
|
|
|
|
{
|
|
|
|
bool hfp = frame_pointer_needed || stack_realign_fp;
|
|
|
|
unsigned i, count;
|
|
|
|
unsigned regno;
|
|
|
|
|
|
|
|
for (count = i = MIN_REGS; i < MAX_REGS; ++i)
|
|
|
|
{
|
|
|
|
regno = REG_ORDER[i];
|
|
|
|
if (regno == BP_REG && hfp)
|
|
|
|
continue;
|
|
|
|
if (!ix86_save_reg (regno, false, false))
|
|
|
|
break;
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Determine if register REGNO is a stub managed register given the
|
|
|
|
total COUNT of stub managed registers. */
|
|
|
|
bool
|
|
|
|
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
|
|
|
|
{
|
|
|
|
bool hfp = frame_pointer_needed || stack_realign_fp;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 0; i < count; ++i)
|
|
|
|
{
|
|
|
|
gcc_assert (i < MAX_REGS);
|
|
|
|
if (REG_ORDER[i] == BP_REG && hfp)
|
|
|
|
++count;
|
|
|
|
else if (REG_ORDER[i] == regno)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Constructor for xlogue_layout. */
|
|
|
|
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
|
|
|
|
: m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
|
|
|
|
m_stack_align_off_in (stack_align_off_in)
|
|
|
|
{
|
|
|
|
HOST_WIDE_INT offset = stack_align_off_in;
|
|
|
|
unsigned i, j;
|
|
|
|
|
|
|
|
for (i = j = 0; i < MAX_REGS; ++i)
|
|
|
|
{
|
|
|
|
unsigned regno = REG_ORDER[i];
|
|
|
|
|
|
|
|
if (regno == BP_REG && hfp)
|
|
|
|
continue;
|
|
|
|
if (SSE_REGNO_P (regno))
|
|
|
|
{
|
|
|
|
offset += 16;
|
|
|
|
/* Verify that SSE regs are always aligned. */
|
|
|
|
gcc_assert (!((stack_align_off_in + offset) & 15));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
offset += 8;
|
|
|
|
|
|
|
|
m_regs[j].regno = regno;
|
|
|
|
m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
|
|
|
|
}
|
|
|
|
gcc_assert (j == m_nregs);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *
|
|
|
|
xlogue_layout::get_stub_name (enum xlogue_stub stub,
|
|
|
|
unsigned n_extra_regs)
|
|
|
|
{
|
|
|
|
const int have_avx = TARGET_AVX;
|
|
|
|
char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
|
|
|
|
|
|
|
|
/* Lazy init */
|
|
|
|
if (!*name)
|
|
|
|
{
|
|
|
|
int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
|
|
|
|
(have_avx ? "avx" : "sse"),
|
|
|
|
STUB_BASE_NAMES[stub],
|
|
|
|
MIN_REGS + n_extra_regs);
|
|
|
|
gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
return name;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return rtx of a symbol ref for the entry point (based upon
|
|
|
|
cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
|
|
|
|
rtx
|
|
|
|
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
|
|
|
|
{
|
|
|
|
const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
|
|
|
|
gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
|
|
|
|
gcc_assert (stub < XLOGUE_STUB_COUNT);
|
|
|
|
gcc_assert (crtl->stack_realign_finalized);
|
|
|
|
|
|
|
|
return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned scalar_chain::max_id = 0;
|
|
|
|
|
2019-08-27 09:39:34 +02:00
|
|
|
namespace {
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
/* Initialize new chain.  SMODE_ is the scalar mode of the chain's insns
   and VMODE_ the vector mode they will be converted to.  Allocation order
   matters: bitmap_obstack_initialize must precede the BITMAP_ALLOC calls
   (the destructor releases them in the mirror order).  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  /* Hand out a fresh unique id, used in dump output.  */
  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  /* All of the chain's bitmaps live on the default bitmap obstack.  */
  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);		/* Insn uids in the chain.  */
  defs = BITMAP_ALLOC (NULL);		/* Regnos defined in the chain.  */
  defs_conv = BITMAP_ALLOC (NULL);	/* Regnos needing both modes.  */
  insns_conv = BITMAP_ALLOC (NULL);	/* Out-of-chain insns whose defs
					   need conversion.  */
  queue = NULL;

  /* Counters for the estimated cost of moving values between the
     integer and SSE register files; see mark_dual_mode_def.  */
  n_sse_to_integer = 0;
  n_integer_to_sse = 0;
}
|
|
|
|
|
|
|
|
/* Free chain's data.  */

scalar_chain::~scalar_chain ()
{
  /* Release the bitmaps allocated in the constructor...  */
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  BITMAP_FREE (insns_conv);
  /* ...then drop the obstack registration matching the constructor's
     bitmap_obstack_initialize.  */
  bitmap_obstack_release (NULL);
}
|
|
|
|
|
|
|
|
/* Add instruction into chains' queue. */
|
|
|
|
|
|
|
|
void
|
|
|
|
scalar_chain::add_to_queue (unsigned insn_uid)
|
|
|
|
{
|
|
|
|
if (bitmap_bit_p (insns, insn_uid)
|
|
|
|
|| bitmap_bit_p (queue, insn_uid))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
|
|
|
|
insn_uid, chain_id);
|
|
|
|
bitmap_set_bit (queue, insn_uid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  Also maintains the n_integer_to_sse / n_sse_to_integer
   counters used to estimate the cost of moving DEF's value between the
   register files.  */

void
scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.
     bitmap_set_bit returns true iff the bit was newly set.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      /* DEF's insn is outside the chain: its value is produced in integer
	 mode and consumed in vector mode by the chain.  Only count/report
	 when either the insn or the reg is seen for the first time.  */
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;
      n_integer_to_sse++;
    }
  else
    {
      /* DEF's insn is inside the chain: its value will be produced in
	 vector mode but is also needed in integer mode elsewhere.  */
      if (!reg_new)
	return;
      n_sse_to_integer++;
    }

  if (dump_file)
    fprintf (dump_file,
	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
|
|
|
|
|
|
|
|
/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  REF must belong to an insn
   already in the chain or in CANDIDATES.  */

void
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
{
  df_link *chain;

  gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
  add_to_queue (DF_REF_INSN_UID (ref));

  /* Walk the df def-use/use-def links attached to REF.  */
  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      /* Debug insns never force a conversion.  */
      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  /* Linked ref is in an insn already collected -- nothing new.  */
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  /* Linked ref is in a convertible candidate: pull it into the
	     chain via the queue.  */
	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }
	}

      /* At this point the linked insn is not convertible, so one side of
	 the link must be available in both modes.  */
      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  /* The linked ref is a use in a non-convertible insn, so mark
	     our own def REF as needing both modes.  */
	  if (dump_file)
	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (ref);
	}
    }
}
|
|
|
|
|
|
|
|
/* Add instruction into a chain. */
|
|
|
|
|
|
|
|
void
|
|
|
|
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
|
|
|
|
{
|
|
|
|
if (bitmap_bit_p (insns, insn_uid))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
|
|
|
|
|
|
|
|
bitmap_set_bit (insns, insn_uid);
|
|
|
|
|
|
|
|
rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
|
|
|
|
rtx def_set = single_set (insn);
|
|
|
|
if (def_set && REG_P (SET_DEST (def_set))
|
|
|
|
&& !HARD_REGISTER_P (SET_DEST (def_set)))
|
|
|
|
bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
|
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
/* ??? The following is quadratic since analyze_register_chain
|
|
|
|
iterates over all refs to look for dual-mode regs. Instead this
|
|
|
|
should be done separately for all regs mentioned in the chain once. */
|
2019-05-06 09:18:26 +02:00
|
|
|
df_ref ref;
|
|
|
|
for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
|
|
|
|
if (!HARD_REGISTER_P (DF_REF_REG (ref)))
|
2019-08-26 12:35:59 +02:00
|
|
|
analyze_register_chain (candidates, ref);
|
2019-05-06 09:18:26 +02:00
|
|
|
for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
|
|
|
|
if (!DF_REF_REG_MEM_P (ref))
|
|
|
|
analyze_register_chain (candidates, ref);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Build new chain starting from insn INSN_UID recursively
|
|
|
|
adding all dependent uses and definitions. */
|
|
|
|
|
|
|
|
void
|
|
|
|
scalar_chain::build (bitmap candidates, unsigned insn_uid)
|
|
|
|
{
|
|
|
|
queue = BITMAP_ALLOC (NULL);
|
|
|
|
bitmap_set_bit (queue, insn_uid);
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, "Building chain #%d...\n", chain_id);
|
|
|
|
|
|
|
|
while (!bitmap_empty_p (queue))
|
|
|
|
{
|
|
|
|
insn_uid = bitmap_first_set_bit (queue);
|
|
|
|
bitmap_clear_bit (queue, insn_uid);
|
|
|
|
bitmap_clear_bit (candidates, insn_uid);
|
|
|
|
add_insn (candidates, insn_uid);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
{
|
|
|
|
fprintf (dump_file, "Collected chain #%d...\n", chain_id);
|
|
|
|
fprintf (dump_file, " insns: ");
|
|
|
|
dump_bitmap (dump_file, insns);
|
|
|
|
if (!bitmap_empty_p (defs_conv))
|
|
|
|
{
|
|
|
|
bitmap_iterator bi;
|
|
|
|
unsigned id;
|
|
|
|
const char *comma = "";
|
|
|
|
fprintf (dump_file, " defs to convert: ");
|
|
|
|
EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
|
|
|
|
{
|
|
|
|
fprintf (dump_file, "%sr%d", comma, id);
|
|
|
|
comma = ", ";
|
|
|
|
}
|
|
|
|
fprintf (dump_file, "\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
BITMAP_FREE (queue);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return a cost of building a vector costant
|
|
|
|
instead of using a scalar one. */
|
|
|
|
|
|
|
|
int
|
2019-08-14 14:04:05 +02:00
|
|
|
general_scalar_chain::vector_const_cost (rtx exp)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
gcc_assert (CONST_INT_P (exp));
|
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
if (standard_sse_constant_p (exp, vmode))
|
|
|
|
return ix86_cost->sse_op;
|
|
|
|
/* We have separate costs for SImode and DImode, use SImode costs
|
|
|
|
for smaller modes. */
|
|
|
|
return ix86_cost->sse_load[smode == DImode ? 1 : 0];
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute a gain for chain conversion. */
|
|
|
|
|
|
|
|
int
|
2019-08-14 14:04:05 +02:00
|
|
|
general_scalar_chain::compute_convert_gain ()
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
bitmap_iterator bi;
|
|
|
|
unsigned insn_uid;
|
|
|
|
int gain = 0;
|
|
|
|
int cost = 0;
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
|
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
/* SSE costs distinguish between SImode and DImode loads/stores, for
|
|
|
|
int costs factor in the number of GPRs involved. When supporting
|
|
|
|
smaller modes than SImode the int load/store costs need to be
|
|
|
|
adjusted as well. */
|
|
|
|
unsigned sse_cost_idx = smode == DImode ? 1 : 0;
|
|
|
|
unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
|
|
|
|
{
|
|
|
|
rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
|
|
|
|
rtx def_set = single_set (insn);
|
|
|
|
rtx src = SET_SRC (def_set);
|
|
|
|
rtx dst = SET_DEST (def_set);
|
2019-08-14 10:31:54 +02:00
|
|
|
int igain = 0;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
if (REG_P (src) && REG_P (dst))
|
2019-08-14 14:04:05 +02:00
|
|
|
igain += 2 * m - ix86_cost->xmm_move;
|
2019-05-06 09:18:26 +02:00
|
|
|
else if (REG_P (src) && MEM_P (dst))
|
2019-08-14 14:04:05 +02:00
|
|
|
igain
|
|
|
|
+= m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
|
2019-05-06 09:18:26 +02:00
|
|
|
else if (MEM_P (src) && REG_P (dst))
|
2019-08-14 14:04:05 +02:00
|
|
|
igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
|
2021-07-01 10:56:32 +02:00
|
|
|
else
|
|
|
|
switch (GET_CODE (src))
|
|
|
|
{
|
|
|
|
case ASHIFT:
|
|
|
|
case ASHIFTRT:
|
|
|
|
case LSHIFTRT:
|
|
|
|
if (m == 2)
|
|
|
|
{
|
|
|
|
if (INTVAL (XEXP (src, 1)) >= 32)
|
|
|
|
igain += ix86_cost->add;
|
|
|
|
else
|
|
|
|
igain += ix86_cost->shift_const;
|
|
|
|
}
|
2019-08-29 21:47:19 +02:00
|
|
|
|
2021-07-01 10:56:32 +02:00
|
|
|
igain += ix86_cost->shift_const - ix86_cost->sse_op;
|
2019-08-29 21:47:19 +02:00
|
|
|
|
2021-07-01 10:56:32 +02:00
|
|
|
if (CONST_INT_P (XEXP (src, 0)))
|
|
|
|
igain -= vector_const_cost (XEXP (src, 0));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case AND:
|
|
|
|
case IOR:
|
|
|
|
case XOR:
|
|
|
|
case PLUS:
|
|
|
|
case MINUS:
|
|
|
|
igain += m * ix86_cost->add - ix86_cost->sse_op;
|
|
|
|
/* Additional gain for andnot for targets without BMI. */
|
|
|
|
if (GET_CODE (XEXP (src, 0)) == NOT
|
|
|
|
&& !TARGET_BMI)
|
|
|
|
igain += m * ix86_cost->add;
|
|
|
|
|
|
|
|
if (CONST_INT_P (XEXP (src, 0)))
|
|
|
|
igain -= vector_const_cost (XEXP (src, 0));
|
|
|
|
if (CONST_INT_P (XEXP (src, 1)))
|
|
|
|
igain -= vector_const_cost (XEXP (src, 1));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NEG:
|
|
|
|
case NOT:
|
|
|
|
igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
|
|
|
|
|
|
|
|
if (GET_CODE (XEXP (src, 0)) != ABS)
|
|
|
|
{
|
|
|
|
igain += m * ix86_cost->add;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* FALLTHRU */
|
|
|
|
|
|
|
|
case ABS:
|
|
|
|
case SMAX:
|
|
|
|
case SMIN:
|
|
|
|
case UMAX:
|
|
|
|
case UMIN:
|
|
|
|
/* We do not have any conditional move cost, estimate it as a
|
|
|
|
reg-reg move. Comparisons are costed as adds. */
|
|
|
|
igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
|
|
|
|
/* Integer SSE ops are all costed the same. */
|
|
|
|
igain -= ix86_cost->sse_op;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case COMPARE:
|
|
|
|
/* Assume comparison cost is the same. */
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CONST_INT:
|
|
|
|
if (REG_P (dst))
|
2021-08-24 04:04:48 +02:00
|
|
|
{
|
|
|
|
if (optimize_insn_for_size_p ())
|
|
|
|
{
|
|
|
|
/* xor (2 bytes) vs. xorps (3 bytes). */
|
|
|
|
if (src == const0_rtx)
|
|
|
|
igain -= COSTS_N_BYTES (1);
|
|
|
|
/* movdi_internal vs. movv2di_internal. */
|
|
|
|
/* => mov (5 bytes) vs. movaps (7 bytes). */
|
|
|
|
else if (x86_64_immediate_operand (src, SImode))
|
|
|
|
igain -= COSTS_N_BYTES (2);
|
|
|
|
else
|
|
|
|
/* ??? Larger immediate constants are placed in the
|
|
|
|
constant pool, where the size benefit/impact of
|
|
|
|
STV conversion is affected by whether and how
|
|
|
|
often each constant pool entry is shared/reused.
|
|
|
|
The value below is empirically derived from the
|
|
|
|
CSiBE benchmark (and the optimal value may drift
|
|
|
|
over time). */
|
|
|
|
igain += COSTS_N_BYTES (0);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* DImode can be immediate for TARGET_64BIT
|
|
|
|
and SImode always. */
|
|
|
|
igain += m * COSTS_N_INSNS (1);
|
|
|
|
igain -= vector_const_cost (src);
|
|
|
|
}
|
|
|
|
}
|
2021-07-01 10:56:32 +02:00
|
|
|
else if (MEM_P (dst))
|
2021-08-24 04:04:48 +02:00
|
|
|
{
|
|
|
|
igain += (m * ix86_cost->int_store[2]
|
|
|
|
- ix86_cost->sse_store[sse_cost_idx]);
|
|
|
|
igain -= vector_const_cost (src);
|
|
|
|
}
|
2021-07-01 10:56:32 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
gcc_unreachable ();
|
|
|
|
}
|
2019-08-14 10:31:54 +02:00
|
|
|
|
|
|
|
if (igain != 0 && dump_file)
|
|
|
|
{
|
|
|
|
fprintf (dump_file, " Instruction gain %d for ", igain);
|
|
|
|
dump_insn_slim (dump_file, insn);
|
|
|
|
}
|
|
|
|
gain += igain;
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
|
|
|
|
|
2019-08-27 14:46:07 +02:00
|
|
|
/* Cost the integer to sse and sse to integer moves. */
|
|
|
|
cost += n_sse_to_integer * ix86_cost->sse_to_integer;
|
|
|
|
/* ??? integer_to_sse but we only have that in the RA cost table.
|
|
|
|
Assume sse_to_integer/integer_to_sse are the same which they
|
|
|
|
are at the moment. */
|
|
|
|
cost += n_integer_to_sse * ix86_cost->sse_to_integer;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Registers conversion cost: %d\n", cost);
|
|
|
|
|
|
|
|
gain -= cost;
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Total gain: %d\n", gain);
|
|
|
|
|
|
|
|
return gain;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Insert generated conversion instruction sequence INSNS
|
|
|
|
after instruction AFTER. New BB may be required in case
|
|
|
|
instruction has EH region attached. */
|
|
|
|
|
|
|
|
void
|
|
|
|
scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
|
|
|
|
{
|
|
|
|
if (!control_flow_insn_p (after))
|
|
|
|
{
|
|
|
|
emit_insn_after (insns, after);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
basic_block bb = BLOCK_FOR_INSN (after);
|
|
|
|
edge e = find_fallthru_edge (bb->succs);
|
|
|
|
gcc_assert (e);
|
|
|
|
|
|
|
|
basic_block new_bb = split_edge (e);
|
|
|
|
emit_insn_after (insns, BB_HEAD (new_bb));
|
|
|
|
}
|
|
|
|
|
2019-08-27 09:39:34 +02:00
|
|
|
} // anon namespace
|
|
|
|
|
2019-08-15 14:44:23 +02:00
|
|
|
/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
|
|
|
|
zeroing the upper parts. */
|
|
|
|
|
|
|
|
static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
  unsigned nunits = GET_MODE_NUNITS (vmode);

  /* One element, e.g. TImode -> V1TImode: a plain paradoxical
     subreg suffices.  */
  if (nunits == 1)
    return gen_rtx_SUBREG (vmode, gpr, 0);

  /* Two elements: concatenate GPR with a zero in the high half.  */
  if (nunits == 2)
    return gen_rtx_VEC_CONCAT (vmode, gpr,
			       CONST0_RTX (GET_MODE_INNER (vmode)));

  /* More elements: splat GPR and merge element 0 over a zero vector.  */
  return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
			    CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
}
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
/* Make vector copies for all register REGNO definitions
|
|
|
|
and replace its uses in a chain. */
|
|
|
|
|
|
|
|
void
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problems appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-27 14:46:07 +02:00
|
|
|
rtx vreg = *defs_map.get (reg);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-27 14:46:07 +02:00
|
|
|
start_sequence ();
|
|
|
|
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-27 14:46:07 +02:00
|
|
|
rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
|
|
|
|
if (smode == DImode && !TARGET_64BIT)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-27 14:46:07 +02:00
|
|
|
emit_move_insn (adjust_address (tmp, SImode, 0),
|
|
|
|
gen_rtx_SUBREG (SImode, reg, 0));
|
|
|
|
emit_move_insn (adjust_address (tmp, SImode, 4),
|
|
|
|
gen_rtx_SUBREG (SImode, reg, 4));
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
2019-08-27 14:46:07 +02:00
|
|
|
else
|
|
|
|
emit_move_insn (copy_rtx (tmp), reg);
|
|
|
|
emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
|
|
|
|
gen_gpr_to_xmm_move_src (vmode, tmp)));
|
|
|
|
}
|
|
|
|
else if (!TARGET_64BIT && smode == DImode)
|
|
|
|
{
|
|
|
|
if (TARGET_SSE4_1)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-27 14:46:07 +02:00
|
|
|
emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
|
|
|
|
CONST0_RTX (V4SImode),
|
|
|
|
gen_rtx_SUBREG (SImode, reg, 0)));
|
|
|
|
emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
|
|
|
|
gen_rtx_SUBREG (V4SImode, vreg, 0),
|
|
|
|
gen_rtx_SUBREG (SImode, reg, 4),
|
|
|
|
GEN_INT (2)));
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
2019-08-26 12:35:59 +02:00
|
|
|
else
|
2019-08-27 14:46:07 +02:00
|
|
|
{
|
|
|
|
rtx tmp = gen_reg_rtx (DImode);
|
|
|
|
emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
|
|
|
|
CONST0_RTX (V4SImode),
|
|
|
|
gen_rtx_SUBREG (SImode, reg, 0)));
|
|
|
|
emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
|
|
|
|
CONST0_RTX (V4SImode),
|
|
|
|
gen_rtx_SUBREG (SImode, reg, 4)));
|
|
|
|
emit_insn (gen_vec_interleave_lowv4si
|
|
|
|
(gen_rtx_SUBREG (V4SImode, vreg, 0),
|
|
|
|
gen_rtx_SUBREG (V4SImode, vreg, 0),
|
|
|
|
gen_rtx_SUBREG (V4SImode, tmp, 0)));
|
|
|
|
}
|
2019-08-26 12:35:59 +02:00
|
|
|
}
|
2019-08-27 14:46:07 +02:00
|
|
|
else
|
|
|
|
emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
|
|
|
|
gen_gpr_to_xmm_move_src (vmode, reg)));
|
|
|
|
rtx_insn *seq = get_insns ();
|
|
|
|
end_sequence ();
|
|
|
|
emit_conversion_insns (seq, insn);
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file,
|
|
|
|
" Copied r%d to a vector register r%d for insn %d\n",
|
|
|
|
REGNO (reg), REGNO (vreg), INSN_UID (insn));
|
2019-08-26 12:35:59 +02:00
|
|
|
}
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-26 12:35:59 +02:00
|
|
|
/* Copy the definition SRC of INSN inside the chain to DST for
|
|
|
|
scalar uses outside of the chain. */
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-26 12:35:59 +02:00
|
|
|
void
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problems appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
|
2019-08-26 12:35:59 +02:00
|
|
|
{
|
|
|
|
start_sequence ();
|
|
|
|
if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
|
|
|
|
{
|
|
|
|
rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
|
|
|
|
emit_move_insn (tmp, src);
|
|
|
|
if (!TARGET_64BIT && smode == DImode)
|
|
|
|
{
|
|
|
|
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
|
|
|
|
adjust_address (tmp, SImode, 0));
|
|
|
|
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
|
|
|
|
adjust_address (tmp, SImode, 4));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
emit_move_insn (dst, copy_rtx (tmp));
|
|
|
|
}
|
|
|
|
else if (!TARGET_64BIT && smode == DImode)
|
|
|
|
{
|
|
|
|
if (TARGET_SSE4_1)
|
|
|
|
{
|
|
|
|
rtx tmp = gen_rtx_PARALLEL (VOIDmode,
|
|
|
|
gen_rtvec (1, const0_rtx));
|
|
|
|
emit_insn
|
|
|
|
(gen_rtx_SET
|
|
|
|
(gen_rtx_SUBREG (SImode, dst, 0),
|
|
|
|
gen_rtx_VEC_SELECT (SImode,
|
|
|
|
gen_rtx_SUBREG (V4SImode, src, 0),
|
|
|
|
tmp)));
|
|
|
|
|
|
|
|
tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
|
|
|
|
emit_insn
|
|
|
|
(gen_rtx_SET
|
|
|
|
(gen_rtx_SUBREG (SImode, dst, 4),
|
|
|
|
gen_rtx_VEC_SELECT (SImode,
|
|
|
|
gen_rtx_SUBREG (V4SImode, src, 0),
|
|
|
|
tmp)));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
rtx vcopy = gen_reg_rtx (V2DImode);
|
|
|
|
emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
|
|
|
|
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
|
|
|
|
gen_rtx_SUBREG (SImode, vcopy, 0));
|
|
|
|
emit_move_insn (vcopy,
|
|
|
|
gen_rtx_LSHIFTRT (V2DImode,
|
|
|
|
vcopy, GEN_INT (32)));
|
|
|
|
emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
|
|
|
|
gen_rtx_SUBREG (SImode, vcopy, 0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
emit_move_insn (dst, src);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-26 12:35:59 +02:00
|
|
|
rtx_insn *seq = get_insns ();
|
|
|
|
end_sequence ();
|
|
|
|
emit_conversion_insns (seq, insn);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-26 12:35:59 +02:00
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file,
|
|
|
|
" Copied r%d to a scalar register r%d for insn %d\n",
|
|
|
|
REGNO (src), REGNO (dst), INSN_UID (insn));
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Convert operand OP in INSN. We should handle
|
|
|
|
memory operands and uninitialized registers.
|
|
|
|
All other register uses are converted during
|
|
|
|
registers conversion. */
|
|
|
|
|
|
|
|
void
|
2019-08-14 14:04:05 +02:00
|
|
|
general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
*op = copy_rtx_if_shared (*op);
|
|
|
|
|
|
|
|
if (GET_CODE (*op) == NOT)
|
|
|
|
{
|
|
|
|
convert_op (&XEXP (*op, 0), insn);
|
2019-08-14 14:04:05 +02:00
|
|
|
PUT_MODE (*op, vmode);
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
else if (MEM_P (*op))
|
|
|
|
{
|
2019-08-14 14:04:05 +02:00
|
|
|
rtx tmp = gen_reg_rtx (GET_MODE (*op));
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-09-20 13:14:34 +02:00
|
|
|
/* Handle movabs. */
|
|
|
|
if (!memory_operand (*op, GET_MODE (*op)))
|
|
|
|
{
|
|
|
|
rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
|
|
|
|
|
|
|
|
emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
|
|
|
|
*op = tmp2;
|
|
|
|
}
|
|
|
|
|
2019-08-20 10:45:56 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
|
|
|
|
gen_gpr_to_xmm_move_src (vmode, *op)),
|
|
|
|
insn);
|
2019-08-14 14:04:05 +02:00
|
|
|
*op = gen_rtx_SUBREG (vmode, tmp, 0);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
|
|
|
|
INSN_UID (insn), REGNO (tmp));
|
|
|
|
}
|
|
|
|
else if (REG_P (*op))
|
|
|
|
{
|
2019-08-14 14:04:05 +02:00
|
|
|
*op = gen_rtx_SUBREG (vmode, *op, 0);
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
else if (CONST_INT_P (*op))
|
|
|
|
{
|
|
|
|
rtx vec_cst;
|
2019-08-14 14:04:05 +02:00
|
|
|
rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
/* Prefer all ones vector in case of -1. */
|
|
|
|
if (constm1_operand (*op, GET_MODE (*op)))
|
2019-08-14 14:04:05 +02:00
|
|
|
vec_cst = CONSTM1_RTX (vmode);
|
2019-05-06 09:18:26 +02:00
|
|
|
else
|
2019-08-14 14:04:05 +02:00
|
|
|
{
|
|
|
|
unsigned n = GET_MODE_NUNITS (vmode);
|
|
|
|
rtx *v = XALLOCAVEC (rtx, n);
|
|
|
|
v[0] = *op;
|
|
|
|
for (unsigned i = 1; i < n; ++i)
|
|
|
|
v[i] = const0_rtx;
|
|
|
|
vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
|
|
|
|
}
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
if (!standard_sse_constant_p (vec_cst, vmode))
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
start_sequence ();
|
2019-08-14 14:04:05 +02:00
|
|
|
vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
|
2019-05-06 09:18:26 +02:00
|
|
|
rtx_insn *seq = get_insns ();
|
|
|
|
end_sequence ();
|
|
|
|
emit_insn_before (seq, insn);
|
|
|
|
}
|
|
|
|
|
|
|
|
emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
|
|
|
|
*op = tmp;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
gcc_assert (SUBREG_P (*op));
|
2019-08-14 14:04:05 +02:00
|
|
|
gcc_assert (GET_MODE (*op) == vmode);
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
/* Convert COMPARE to vector mode. */
|
|
|
|
|
|
|
|
rtx
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::~scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
{
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
rtx src, tmp;
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
/* Comparison against anything other than zero, requires an XOR. */
|
|
|
|
if (op2 != const0_rtx)
|
|
|
|
{
|
2022-07-09 10:02:14 +02:00
|
|
|
convert_op (&op1, insn);
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferrable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
convert_op (&op2, insn);
|
|
|
|
/* If both operands are MEMs, explicitly load the OP1 into TMP. */
|
|
|
|
if (MEM_P (op1) && MEM_P (op2))
|
|
|
|
{
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
tmp = gen_reg_rtx (vmode);
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (tmp, op1), insn);
|
|
|
|
src = tmp;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
src = op1;
|
|
|
|
src = gen_rtx_XOR (vmode, src, op2);
|
|
|
|
}
|
2022-07-09 10:02:14 +02:00
|
|
|
else if (GET_CODE (op1) == AND
|
|
|
|
&& GET_CODE (XEXP (op1, 0)) == NOT)
|
|
|
|
{
|
|
|
|
rtx op11 = XEXP (XEXP (op1, 0), 0);
|
|
|
|
rtx op12 = XEXP (op1, 1);
|
|
|
|
convert_op (&op11, insn);
|
|
|
|
convert_op (&op12, insn);
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
if (!REG_P (op11))
|
2022-07-09 10:02:14 +02:00
|
|
|
{
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
tmp = gen_reg_rtx (vmode);
|
2022-07-09 10:02:14 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (tmp, op11), insn);
|
|
|
|
op11 = tmp;
|
|
|
|
}
|
|
|
|
src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
|
|
|
|
}
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
else if (GET_CODE (op1) == AND)
|
|
|
|
{
|
|
|
|
rtx op11 = XEXP (op1, 0);
|
|
|
|
rtx op12 = XEXP (op1, 1);
|
|
|
|
convert_op (&op11, insn);
|
|
|
|
convert_op (&op12, insn);
|
|
|
|
if (!REG_P (op11))
|
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (vmode);
|
|
|
|
emit_insn_before (gen_rtx_SET (tmp, op11), insn);
|
|
|
|
op11 = tmp;
|
|
|
|
}
|
|
|
|
return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, op11, op12),
|
|
|
|
UNSPEC_PTEST);
|
|
|
|
}
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321, a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
else
|
2022-07-09 10:02:14 +02:00
|
|
|
{
|
|
|
|
convert_op (&op1, insn);
|
|
|
|
src = op1;
|
|
|
|
}
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
|
|
|
|
if (!REG_P (src))
|
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (vmode);
|
|
|
|
emit_insn_before (gen_rtx_SET (tmp, src), insn);
|
|
|
|
src = tmp;
|
|
|
|
}
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321, a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
|
|
|
|
if (vmode == V2DImode)
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (vmode);
|
|
|
|
emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
|
|
|
|
src = tmp;
|
|
|
|
}
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321, a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
else if (vmode == V4SImode)
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (vmode);
|
|
|
|
emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
|
|
|
|
src = tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321, a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
}
|
|
|
|
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problem appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
/* Helper function for converting INSN to vector mode. */
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
void
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problem appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
scalar_chain::convert_insn_common (rtx_insn *insn)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-29 12:30:48 +02:00
|
|
|
/* Generate copies for out-of-chain uses of defs and adjust debug uses. */
|
2019-08-26 12:35:59 +02:00
|
|
|
for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
|
|
|
|
if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
|
|
|
|
{
|
|
|
|
df_link *use;
|
|
|
|
for (use = DF_REF_CHAIN (ref); use; use = use->next)
|
2019-08-29 12:30:48 +02:00
|
|
|
if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
|
|
|
|
&& (DF_REF_REG_MEM_P (use->ref)
|
|
|
|
|| !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
|
2019-08-26 12:35:59 +02:00
|
|
|
break;
|
|
|
|
if (use)
|
|
|
|
convert_reg (insn, DF_REF_REG (ref),
|
|
|
|
*defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
|
2019-08-29 13:59:41 +02:00
|
|
|
else if (MAY_HAVE_DEBUG_BIND_INSNS)
|
2019-08-29 12:30:48 +02:00
|
|
|
{
|
|
|
|
/* If we generated a scalar copy we can leave debug-insns
|
|
|
|
as-is, if not, we have to adjust them. */
|
|
|
|
auto_vec<rtx_insn *, 5> to_reset_debug_insns;
|
|
|
|
for (use = DF_REF_CHAIN (ref); use; use = use->next)
|
|
|
|
if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
|
|
|
|
{
|
|
|
|
rtx_insn *debug_insn = DF_REF_INSN (use->ref);
|
|
|
|
/* If there's a reaching definition outside of the
|
|
|
|
chain we have to reset. */
|
|
|
|
df_link *def;
|
|
|
|
for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
|
|
|
|
if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
|
|
|
|
break;
|
|
|
|
if (def)
|
|
|
|
to_reset_debug_insns.safe_push (debug_insn);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
*DF_REF_REAL_LOC (use->ref)
|
|
|
|
= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
|
|
|
|
df_insn_rescan (debug_insn);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Have to do the reset outside of the DF_CHAIN walk to not
|
|
|
|
disrupt it. */
|
|
|
|
while (!to_reset_debug_insns.is_empty ())
|
|
|
|
{
|
|
|
|
rtx_insn *debug_insn = to_reset_debug_insns.pop ();
|
|
|
|
INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
|
|
|
|
df_insn_rescan_debug_internal (debug_insn);
|
|
|
|
}
|
|
|
|
}
|
2019-08-26 12:35:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Replace uses in this insn with the defs we use in the chain. */
|
|
|
|
for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
|
|
|
|
if (!DF_REF_REG_MEM_P (ref))
|
|
|
|
if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
|
|
|
|
{
|
|
|
|
/* Also update a corresponding REG_DEAD note. */
|
|
|
|
rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
|
|
|
|
if (note)
|
|
|
|
XEXP (note, 0) = *vreg;
|
|
|
|
*DF_REF_REAL_LOC (ref) = *vreg;
|
|
|
|
}
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problems appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Convert INSN to vector mode.  Rewrites the single_set of INSN from
   scalar mode SMODE to vector mode VMODE: adjusts the SET_DEST (via a
   temporary for stores, or a SUBREG of the chain's replacement pseudo
   for register defs), converts the operands of the SET_SRC according
   to its rtx code, then re-recognizes and rescans the insn.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
	 temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst) && GET_MODE (dst) == smode)
    {
      /* Replace the definition with a SUBREG to the definition we
	 use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
	dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
	 is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
	remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      /* Binary operations: convert the second operand here, then fall
	 through to share the first-operand conversion with the unary
	 codes below.  */
      convert_op (&XEXP (src, 1), insn);
      /* FALLTHRU */

    case ABS:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case NEG:
      src = XEXP (src, 0);

      /* (neg (abs ...)) is first rewritten as a vector ABS into a fresh
	 pseudo, which then feeds the 0 - x expansion of NEG below.  */
      if (GET_CODE (src) == ABS)
	{
	  src = XEXP (src, 0);
	  convert_op (&src, insn);
	  subreg = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (subreg,
					 gen_rtx_ABS (vmode, src)), insn);
	  src = subreg;
	}
      else
	convert_op (&src, insn);

      /* Expand vector negation as subtraction from zero.  */
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      /* Expand vector one's complement as XOR with all-ones.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      if (!REG_P (dst))
	convert_op (&src, insn);
      break;

    case REG:
      if (!MEM_P (dst))
	convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      /* The comparison result lives in the flags register; the vector
	 comparison sequence itself is built by convert_compare.  */
      dst = gen_rtx_REG (CCmode, FLAGS_REG);
      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if  (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
/* Compute a gain for chain conversion. */
|
|
|
|
|
|
|
|
int
|
|
|
|
timode_scalar_chain::compute_convert_gain ()
|
|
|
|
{
|
|
|
|
/* Assume that if we have to move TImode values between units,
|
|
|
|
then transforming this chain isn't worth it. */
|
|
|
|
if (n_sse_to_integer || n_integer_to_sse)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
bitmap_iterator bi;
|
|
|
|
unsigned insn_uid;
|
|
|
|
|
|
|
|
/* Split ties to prefer V1TImode when not optimizing for size. */
|
|
|
|
int gain = optimize_size ? 0 : 1;
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
|
|
|
|
|
|
|
|
EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
|
|
|
|
{
|
|
|
|
rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
|
|
|
|
rtx def_set = single_set (insn);
|
|
|
|
rtx src = SET_SRC (def_set);
|
|
|
|
rtx dst = SET_DEST (def_set);
|
|
|
|
int igain = 0;
|
|
|
|
|
|
|
|
switch (GET_CODE (src))
|
|
|
|
{
|
|
|
|
case REG:
|
|
|
|
if (optimize_insn_for_size_p ())
|
|
|
|
igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
|
|
|
|
else
|
|
|
|
igain = COSTS_N_INSNS (1);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case MEM:
|
|
|
|
igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
|
|
|
|
: COSTS_N_INSNS (1);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CONST_INT:
|
|
|
|
if (MEM_P (dst)
|
|
|
|
&& standard_sse_constant_p (src, V1TImode))
|
|
|
|
igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (11) : 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NOT:
|
|
|
|
if (MEM_P (dst))
|
|
|
|
igain = -COSTS_N_INSNS (1);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case AND:
|
|
|
|
case XOR:
|
|
|
|
case IOR:
|
|
|
|
if (!MEM_P (dst))
|
|
|
|
igain = COSTS_N_INSNS (1);
|
|
|
|
break;
|
|
|
|
|
Support logical shifts by (some) integer constants in TImode STV on x86_64.
This patch improves TImode STV by adding support for logical shifts by
integer constants that are multiples of 8. For the test case:
unsigned __int128 a, b;
void foo() { a = b << 16; }
on x86_64, gcc -O2 currently generates:
movq b(%rip), %rax
movq b+8(%rip), %rdx
shldq $16, %rax, %rdx
salq $16, %rax
movq %rax, a(%rip)
movq %rdx, a+8(%rip)
ret
with this patch we now generate:
movdqa b(%rip), %xmm0
pslldq $2, %xmm0
movaps %xmm0, a(%rip)
ret
2022-08-03 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain): Add gain
for converting suitable TImode shift to a V1TImode shift.
(timode_scalar_chain::convert_insn): Add support for converting
suitable ASHIFT and LSHIFTRT.
(timode_scalar_to_vector_candidate_p): Consider logical shifts
by integer constants that are multiples of 8 to be candidates.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-7.c: New test case.
2022-08-03 10:00:20 +02:00
|
|
|
case ASHIFT:
|
|
|
|
case LSHIFTRT:
|
|
|
|
/* For logical shifts by constant multiples of 8. */
|
|
|
|
igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (4)
|
|
|
|
: COSTS_N_INSNS (1);
|
|
|
|
break;
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (igain != 0 && dump_file)
|
|
|
|
{
|
|
|
|
fprintf (dump_file, " Instruction gain %d for ", igain);
|
|
|
|
dump_insn_slim (dump_file, insn);
|
|
|
|
}
|
|
|
|
gain += igain;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Total gain: %d\n", gain);
|
|
|
|
|
|
|
|
return gain;
|
|
|
|
}
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
/* Fix uses of converted REG in debug insns. */
|
|
|
|
|
|
|
|
void
|
|
|
|
timode_scalar_chain::fix_debug_reg_uses (rtx reg)
|
|
|
|
{
|
|
|
|
if (!flag_var_tracking)
|
|
|
|
return;
|
|
|
|
|
|
|
|
df_ref ref, next;
|
|
|
|
for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
|
|
|
|
{
|
|
|
|
rtx_insn *insn = DF_REF_INSN (ref);
|
|
|
|
/* Make sure the next ref is for a different instruction,
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS.
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
so that we're not affected by the rescan. */
|
2019-05-06 09:18:26 +02:00
|
|
|
next = DF_REF_NEXT_REG (ref);
|
|
|
|
while (next && DF_REF_INSN (next) == insn)
|
|
|
|
next = DF_REF_NEXT_REG (next);
|
|
|
|
|
|
|
|
if (DEBUG_INSN_P (insn))
|
|
|
|
{
|
|
|
|
/* It may be a debug insn with a TImode variable in
|
|
|
|
register. */
|
|
|
|
bool changed = false;
|
|
|
|
for (; ref != next; ref = DF_REF_NEXT_REG (ref))
|
|
|
|
{
|
|
|
|
rtx *loc = DF_REF_LOC (ref);
|
|
|
|
if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
|
|
|
|
{
|
|
|
|
*loc = gen_rtx_SUBREG (TImode, *loc, 0);
|
|
|
|
changed = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (changed)
|
|
|
|
df_insn_rescan (insn);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
/* Convert operand OP in INSN from TImode to V1TImode. */
|
|
|
|
|
|
|
|
void
|
|
|
|
timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
|
|
|
|
{
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
if (GET_MODE (*op) == V1TImode)
|
|
|
|
return;
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
	to check whether COMPARE is convertible.  Handle SET_DESTs that
	are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
*op = copy_rtx_if_shared (*op);
|
|
|
|
|
|
|
|
if (REG_P (*op))
|
|
|
|
*op = gen_rtx_SUBREG (V1TImode, *op, 0);
|
|
|
|
else if (MEM_P (*op))
|
|
|
|
{
|
|
|
|
rtx tmp = gen_reg_rtx (V1TImode);
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (tmp,
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
gen_gpr_to_xmm_move_src (V1TImode, *op)),
|
|
|
|
insn);
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
*op = tmp;
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
|
|
|
|
INSN_UID (insn), REGNO (tmp));
|
|
|
|
}
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
else if (CONST_SCALAR_INT_P (*op))
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
{
|
|
|
|
rtx vec_cst;
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
rtx tmp = gen_reg_rtx (V1TImode);
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
|
|
|
|
/* Prefer all ones vector in case of -1. */
|
|
|
|
if (constm1_operand (*op, TImode))
|
|
|
|
vec_cst = CONSTM1_RTX (V1TImode);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
rtx *v = XALLOCAVEC (rtx, 1);
|
|
|
|
v[0] = *op;
|
|
|
|
vec_cst = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec_v (1, v));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!standard_sse_constant_p (vec_cst, V1TImode))
|
|
|
|
{
|
|
|
|
start_sequence ();
|
|
|
|
vec_cst = validize_mem (force_const_mem (V1TImode, vec_cst));
|
|
|
|
rtx_insn *seq = get_insns ();
|
|
|
|
end_sequence ();
|
|
|
|
emit_insn_before (seq, insn);
|
|
|
|
}
|
|
|
|
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
*op = tmp;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
gcc_assert (SUBREG_P (*op));
|
|
|
|
gcc_assert (GET_MODE (*op) == vmode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
/* Convert INSN from TImode to V1T1mode. */
|
|
|
|
|
|
|
|
void
|
|
|
|
timode_scalar_chain::convert_insn (rtx_insn *insn)
|
|
|
|
{
|
|
|
|
rtx def_set = single_set (insn);
|
|
|
|
rtx src = SET_SRC (def_set);
|
|
|
|
rtx dst = SET_DEST (def_set);
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
rtx tmp;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
switch (GET_CODE (dst))
|
|
|
|
{
|
|
|
|
case REG:
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
if (GET_MODE (dst) == TImode)
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS].
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
{
|
|
|
|
PUT_MODE (dst, V1TImode);
|
|
|
|
fix_debug_reg_uses (dst);
|
|
|
|
}
|
|
|
|
if (GET_MODE (dst) == V1TImode)
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
{
|
|
|
|
tmp = find_reg_equal_equiv_note (insn);
|
2022-08-02 00:08:23 +02:00
|
|
|
if (tmp)
|
|
|
|
{
|
|
|
|
if (GET_MODE (XEXP (tmp, 0)) == TImode)
|
|
|
|
PUT_MODE (XEXP (tmp, 0), V1TImode);
|
|
|
|
else if (CONST_SCALAR_INT_P (XEXP (tmp, 0)))
|
|
|
|
XEXP (tmp, 0)
|
|
|
|
= gen_rtx_CONST_VECTOR (V1TImode,
|
|
|
|
gen_rtvec (1, XEXP (tmp, 0)));
|
|
|
|
}
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
}
|
2019-05-06 09:18:26 +02:00
|
|
|
break;
|
|
|
|
case MEM:
|
|
|
|
PUT_MODE (dst, V1TImode);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
gcc_unreachable ();
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (GET_CODE (src))
|
|
|
|
{
|
|
|
|
case REG:
|
|
|
|
PUT_MODE (src, V1TImode);
|
|
|
|
/* Call fix_debug_reg_uses only if SRC is never defined. */
|
|
|
|
if (!DF_REG_DEF_CHAIN (REGNO (src)))
|
|
|
|
fix_debug_reg_uses (src);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case MEM:
|
|
|
|
PUT_MODE (src, V1TImode);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CONST_WIDE_INT:
|
|
|
|
if (NONDEBUG_INSN_P (insn))
|
|
|
|
{
|
|
|
|
/* Since there are no instructions to store 128-bit constant,
|
|
|
|
temporary register usage is required. */
|
|
|
|
start_sequence ();
|
|
|
|
src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
|
|
|
|
src = validize_mem (force_const_mem (V1TImode, src));
|
|
|
|
rtx_insn *seq = get_insns ();
|
|
|
|
end_sequence ();
|
|
|
|
if (seq)
|
|
|
|
emit_insn_before (seq, insn);
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
if (MEM_P (dst))
|
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (V1TImode);
|
|
|
|
emit_insn_before (gen_rtx_SET (tmp, src), insn);
|
|
|
|
src = tmp;
|
|
|
|
}
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CONST_INT:
|
|
|
|
switch (standard_sse_constant_p (src, TImode))
|
|
|
|
{
|
|
|
|
case 1:
|
|
|
|
src = CONST0_RTX (GET_MODE (dst));
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
src = CONSTM1_RTX (GET_MODE (dst));
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
gcc_unreachable ();
|
|
|
|
}
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
if (MEM_P (dst))
|
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (V1TImode);
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS].
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (tmp, src), insn);
|
|
|
|
src = tmp;
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case AND:
|
|
|
|
if (GET_CODE (XEXP (src, 0)) == NOT)
|
|
|
|
{
|
|
|
|
convert_op (&XEXP (XEXP (src, 0), 0), insn);
|
|
|
|
convert_op (&XEXP (src, 1), insn);
|
|
|
|
PUT_MODE (XEXP (src, 0), V1TImode);
|
|
|
|
PUT_MODE (src, V1TImode);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* FALLTHRU */
|
|
|
|
|
|
|
|
case XOR:
|
|
|
|
case IOR:
|
|
|
|
convert_op (&XEXP (src, 0), insn);
|
|
|
|
convert_op (&XEXP (src, 1), insn);
|
|
|
|
PUT_MODE (src, V1TImode);
|
|
|
|
if (MEM_P (dst))
|
|
|
|
{
|
|
|
|
tmp = gen_reg_rtx (V1TImode);
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS].
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (tmp, src), insn);
|
|
|
|
src = tmp;
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NOT:
|
|
|
|
src = XEXP (src, 0);
|
|
|
|
convert_op (&src, insn);
|
|
|
|
tmp = gen_reg_rtx (V1TImode);
|
|
|
|
emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
|
|
|
|
src = gen_rtx_XOR (V1TImode, src, tmp);
|
|
|
|
if (MEM_P (dst))
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
tmp = gen_reg_rtx (V1TImode);
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS.
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
emit_insn_before (gen_rtx_SET (tmp, src), insn);
|
|
|
|
src = tmp;
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
case COMPARE:
|
|
|
|
dst = gen_rtx_REG (CCmode, FLAGS_REG);
|
|
|
|
src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
|
|
|
|
break;
|
|
|
|
|
Support logical shifts by (some) integer constants in TImode STV on x86_64.
This patch improves TImode STV by adding support for logical shifts by
integer constants that are multiples of 8. For the test case:
unsigned __int128 a, b;
void foo() { a = b << 16; }
on x86_64, gcc -O2 currently generates:
movq b(%rip), %rax
movq b+8(%rip), %rdx
shldq $16, %rax, %rdx
salq $16, %rax
movq %rax, a(%rip)
movq %rdx, a+8(%rip)
ret
with this patch we now generate:
movdqa b(%rip), %xmm0
pslldq $2, %xmm0
movaps %xmm0, a(%rip)
ret
2022-08-03 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain): Add gain
for converting suitable TImode shift to a V1TImode shift.
(timode_scalar_chain::convert_insn): Add support for converting
suitable ASHIFT and LSHIFTRT.
(timode_scalar_to_vector_candidate_p): Consider logical shifts
by integer constants that are multiples of 8 to be candidates.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-7.c: New test case.
2022-08-03 10:00:20 +02:00
|
|
|
case ASHIFT:
|
|
|
|
case LSHIFTRT:
|
|
|
|
convert_op (&XEXP (src, 0), insn);
|
|
|
|
PUT_MODE (src, V1TImode);
|
|
|
|
break;
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
default:
|
|
|
|
gcc_unreachable ();
|
|
|
|
}
|
|
|
|
|
|
|
|
SET_SRC (def_set) = src;
|
|
|
|
SET_DEST (def_set) = dst;
|
|
|
|
|
|
|
|
/* Drop possible dead definitions. */
|
|
|
|
PATTERN (insn) = def_set;
|
|
|
|
|
|
|
|
INSN_CODE (insn) = -1;
|
|
|
|
recog_memoized (insn);
|
|
|
|
df_insn_rescan (insn);
|
|
|
|
}
|
|
|
|
|
2019-08-26 12:35:59 +02:00
|
|
|
/* Generate copies from defs used by the chain but not defined therein.
|
|
|
|
Also populates defs_map which is used later by convert_insn. */
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
void
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problem appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
scalar_chain::convert_registers ()
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
bitmap_iterator bi;
|
|
|
|
unsigned id;
|
2019-08-26 12:35:59 +02:00
|
|
|
EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
|
2019-09-20 08:42:39 +02:00
|
|
|
{
|
|
|
|
rtx chain_reg = gen_reg_rtx (smode);
|
|
|
|
defs_map.put (regno_reg_rtx[id], chain_reg);
|
|
|
|
}
|
2019-08-27 14:46:07 +02:00
|
|
|
EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
|
|
|
|
for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
|
|
|
|
if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
|
|
|
|
make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Convert whole chain creating required register
|
|
|
|
conversions and copies. */
|
|
|
|
|
|
|
|
int
|
|
|
|
scalar_chain::convert ()
|
|
|
|
{
|
|
|
|
bitmap_iterator bi;
|
|
|
|
unsigned id;
|
|
|
|
int converted_insns = 0;
|
|
|
|
|
|
|
|
if (!dbg_cnt (stv_conversion))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file, "Converting chain #%d...\n", chain_id);
|
|
|
|
|
|
|
|
convert_registers ();
|
|
|
|
|
|
|
|
EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
|
|
|
|
{
|
PR target/106303: Fix TImode STV related failures on x86.
This patch resolves PR target/106303 (and the related PRs 106347,
106404, 106407) which are ICEs caused by my improvements to x86_64's
128-bit TImode to V1TImode Scalar to Vector (STV) pass. My apologies
for the breakage. The issue is that data flow analysis is used to
partition usage of each TImode pseudo into "chains", where each
chain is analyzed and if suitable converted to vector operations.
The problems appears when some chains for a pseudo are converted,
and others aren't as RTL sharing can result in some mode changes
leaking into other instructions that aren't/shouldn't/can't be
converted, which eventually leads to an ICE for mismatched modes.
My first approach to a fix was to unify more of the STV infrastructure,
reasoning that if TImode STV was exhibiting these problems, but DImode
and SImode STV weren't, the issue was likely to be caused/resolved by
these remaining differences. This appeared to fix some but not all of
the reported PRs. A better solution was then proposed by H.J. Lu in
Bugzilla, that we need to iterate the removal of candidates in the
function timode_remove_non_convertible_regs until there are no further
changes. As each chain is removed from consideration, it in turn may
affect whether other insns/chains can safely be converted.
2022-07-24 Roger Sayle <roger@nextmovesoftware.com>
H.J. Lu <hjl.tools@gmail.com>
gcc/ChangeLog
PR target/106303
PR target/106347
* config/i386/i386-features.cc (make_vector_copies): Move from
general_scalar_chain to scalar_chain.
(convert_reg): Likewise.
(convert_insn_common): New scalar_chain method split out from
general_scalar_chain convert_insn.
(convert_registers): Move from general_scalar_chain to
scalar_chain.
(scalar_chain::convert): Call convert_insn_common before calling
convert_insn.
(timode_remove_non_convertible_regs): Iterate until there are
no further changes to the candidates.
* config/i386/i386-features.h (scalar_chain::hash_map): Move
from general_scalar_chain.
(scalar_chain::convert_reg): Likewise.
(scalar_chain::convert_insn_common): New shared method.
(scalar_chain::make_vector_copies): Move from general_scalar_chain.
(scalar_chain::convert_registers): Likewise. No longer virtual.
(general_scalar_chain::hash_map): Delete. Moved to scalar_chain.
(general_scalar_chain::convert_reg): Likewise.
(general_scalar_chain::make_vector_copies): Likewise.
(general_scalar_chain::convert_registers): Delete virtual method.
(timode_scalar_chain::convert_registers): Likewise.
gcc/testsuite/ChangeLog
PR target/106303
PR target/106347
* gcc.target/i386/pr106303.c: New test case.
* gcc.target/i386/pr106347.c: New test case.
2022-07-24 13:22:22 +02:00
|
|
|
rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
|
|
|
|
convert_insn_common (insn);
|
|
|
|
convert_insn (insn);
|
2019-05-06 09:18:26 +02:00
|
|
|
converted_insns++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return converted_insns;
|
|
|
|
}
|
|
|
|
|
x86: Allow V1TI vector register pushes
Add V1TI vector register push and split it after reload to a sequence
of:
(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))
so that STV pass can convert TI mode integer push to V1TI vector register
push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls
of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore
pseudo register push.
Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with
-mpreferred-stack-boundary=2 and leads to segfault:
Dump of assembler code for function __bid_nesd2:
0x08049210 <+0>: endbr32
0x08049214 <+4>: push %esi
0x08049215 <+5>: push %ebx
0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx>
0x0804921b <+11>: add $0x8de5,%ebx
0x08049221 <+17>: sub $0x20,%esp
0x08049224 <+20>: mov 0x30(%esp),%esi
0x08049228 <+24>: pushl 0x2c(%esp)
0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64>
0x08049231 <+33>: mov %esi,(%esp)
0x08049234 <+36>: movd %edx,%xmm1
0x08049238 <+40>: movd %eax,%xmm0
0x0804923c <+44>: punpckldq %xmm1,%xmm0
=> 0x08049240 <+48>: movaps %xmm0,0x10(%esp)
0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64>
0x0804924a <+58>: push %edx
0x0804924b <+59>: push %eax
0x0804924c <+60>: pushl 0x1c(%esp)
0x08049250 <+64>: pushl 0x1c(%esp)
0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal>
0x08049259 <+73>: add $0x34,%esp
0x0804925c <+76>: pop %ebx
0x0804925d <+77>: pop %esi
0x0804925e <+78>: ret
when libgcc is compiled with -msse2. According to GCC manual:
'-mpreferred-stack-boundary=NUM'
Attempt to keep the stack boundary aligned to a 2 raised to NUM
byte boundary. If '-mpreferred-stack-boundary' is not specified,
the default is 4 (16 bytes or 128-bits).
*Warning:* If you use this switch, then you must build all modules
with the same value, including any libraries. This includes the
system libraries and startup modules.
c-c++-common/dfp/func-vararg-mixed-2.c, which was added by
commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Wed Jul 30 19:24:02 2008 +0000
func-vararg-alternate-d128-2.c: New.
2008-07-30 H.J. Lu <hongjiu.lu@intel.com>
Joey Ye <joey.ye@intel.com>
* gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.
* gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.
isn't expected to work with libgcc.
gcc/
PR target/95021
* config/i386/i386-features.c (has_non_address_hard_reg):
Renamed to ...
(pseudo_reg_set): This. Return the SET expression. Ignore
pseudo register push.
(general_scalar_to_vector_candidate_p): Combine single_set and
has_non_address_hard_reg calls to pseudo_reg_set.
(timode_scalar_to_vector_candidate_p): Likewise.
* config/i386/i386.md (*pushv1ti2): New pattern.
gcc/testsuite/
PR target/95021
* c-c++-common/dfp/func-vararg-mixed-2.c: Removed.
* gcc.target/i386/pr95021-1.c: New test.
* gcc.target/i386/pr95021-2.c: Likewise.
* gcc.target/i386/pr95021-3.c: Likewise.
* gcc.target/i386/pr95021-4.c: Likewise.
* gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
|
|
|
/* Return the SET expression if INSN doesn't reference hard register.
|
|
|
|
Return NULL if INSN uses or defines a hard register, excluding
|
|
|
|
pseudo register pushes, hard register uses in a memory address,
|
|
|
|
clobbers and flags definitions. */
|
2019-05-06 09:18:26 +02:00
|
|
|
|
x86: Allow V1TI vector register pushes
Add V1TI vector register push and split it after reload to a sequence
of:
(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))
so that STV pass can convert TI mode integer push to V1TI vector register
push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls
of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore
pseudo register push.
Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with
-mpreferred-stack-boundary=2 and leads to segfault:
Dump of assembler code for function __bid_nesd2:
0x08049210 <+0>: endbr32
0x08049214 <+4>: push %esi
0x08049215 <+5>: push %ebx
0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx>
0x0804921b <+11>: add $0x8de5,%ebx
0x08049221 <+17>: sub $0x20,%esp
0x08049224 <+20>: mov 0x30(%esp),%esi
0x08049228 <+24>: pushl 0x2c(%esp)
0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64>
0x08049231 <+33>: mov %esi,(%esp)
0x08049234 <+36>: movd %edx,%xmm1
0x08049238 <+40>: movd %eax,%xmm0
0x0804923c <+44>: punpckldq %xmm1,%xmm0
=> 0x08049240 <+48>: movaps %xmm0,0x10(%esp)
0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64>
0x0804924a <+58>: push %edx
0x0804924b <+59>: push %eax
0x0804924c <+60>: pushl 0x1c(%esp)
0x08049250 <+64>: pushl 0x1c(%esp)
0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal>
0x08049259 <+73>: add $0x34,%esp
0x0804925c <+76>: pop %ebx
0x0804925d <+77>: pop %esi
0x0804925e <+78>: ret
when libgcc is compiled with -msse2. According to GCC manual:
'-mpreferred-stack-boundary=NUM'
Attempt to keep the stack boundary aligned to a 2 raised to NUM
byte boundary. If '-mpreferred-stack-boundary' is not specified,
the default is 4 (16 bytes or 128-bits).
*Warning:* If you use this switch, then you must build all modules
with the same value, including any libraries. This includes the
system libraries and startup modules.
c-c++-common/dfp/func-vararg-mixed-2.c, which was added by
commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Wed Jul 30 19:24:02 2008 +0000
func-vararg-alternate-d128-2.c: New.
2008-07-30 H.J. Lu <hongjiu.lu@intel.com>
Joey Ye <joey.ye@intel.com>
* gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.
* gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.
isn't expected to work with libgcc.
gcc/
PR target/95021
* config/i386/i386-features.c (has_non_address_hard_reg):
Renamed to ...
(pseudo_reg_set): This. Return the SET expression. Ignore
pseudo register push.
(general_scalar_to_vector_candidate_p): Combine single_set and
has_non_address_hard_reg calls to pseudo_reg_set.
(timode_scalar_to_vector_candidate_p): Likewise.
* config/i386/i386.md (*pushv1ti2): New pattern.
gcc/testsuite/
PR target/95021
* c-c++-common/dfp/func-vararg-mixed-2.c: Removed.
* gcc.target/i386/pr95021-1.c: New test.
* gcc.target/i386/pr95021-2.c: Likewise.
* gcc.target/i386/pr95021-3.c: Likewise.
* gcc.target/i386/pr95021-4.c: Likewise.
* gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
|
|
|
/* Return the SET expression if INSN doesn't reference hard register.
   Return NULL if INSN uses or defines a hard register, excluding
   pseudo register pushes, hard register uses in a memory address,
   clobbers and flags definitions.  */

static rtx
pseudo_reg_set (rtx_insn *insn)
{
  /* Only single-set insns are STV candidates.  */
  rtx set = single_set (insn);
  if (!set)
    return NULL;

  /* Check pseudo register push first.  A word-size push of a pseudo
     (TImode on 64-bit, DImode on 32-bit) is accepted even though its
     destination implicitly references the stack pointer, which is a
     hard register.  */
  machine_mode mode = TARGET_64BIT ? TImode : DImode;
  if (REG_P (SET_SRC (set))
      && !HARD_REGISTER_P (SET_SRC (set))
      && push_operand (SET_DEST (set), mode))
    return set;

  /* Reject insns that define a hard register, unless the definition
     is a must-clobber or targets the flags register.  */
  df_ref ref;
  FOR_EACH_INSN_DEF (ref, insn)
    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
	&& DF_REF_REGNO (ref) != FLAGS_REG)
      return NULL;

  /* Reject insns that use a hard register outside of a memory
     address.  */
  FOR_EACH_INSN_USE (ref, insn)
    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
      return NULL;

  return set;
}
|
|
|
|
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
/* Check if comparison INSN may be transformed into vector comparison.
|
|
|
|
Currently we transform equality/inequality checks which look like:
|
|
|
|
(set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y))) */
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
static bool
|
2019-08-15 13:09:38 +02:00
|
|
|
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
PR target/70321: Split double word equality/inequality after STV on x86.
This patch resolves the last piece of PR target/70321 a code quality
(P2 regression) affecting mainline. Currently, for HJ's testcase:
void foo (long long ixi)
{
if (ixi != 14348907)
__builtin_abort ();
}
GCC with -m32 -O2 generates four instructions for the comparison:
movl 16(%esp), %eax
movl 20(%esp), %edx
xorl $14348907, %eax
orl %eax, %edx
but with this patch it now requires only three, making better use of
x86's addressing modes:
movl 16(%esp), %eax
xorl $14348907, %eax
orl 20(%esp), %eax
The solution is to expand "doubleword" equality/inequality expressions
using flag setting COMPARE instructions for the early RTL passes, and
then split them during split1, after STV and before reload.
Hence on x86_64, we now see/allow things like:
(insn 11 8 12 2 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:TI 84 [ x ])
(reg:TI 96))) "cmpti.c":2:43 30 {*cmpti_doubleword}
This allows the STV pass to decide whether it's preferable to perform
this comparison using vector operations, i.e. a pxor/ptest sequence,
or as scalar integer operations, i.e. a xor/xor/or sequence. Alas
this required tweaking of the STV pass to recognize the "new" form of
these comparisons and split out the pxor operation itself. To confirm
this still works as expected I've added a new STV test case:
long long a[1024];
long long b[1024];
int foo()
{
for (int i=0; i<1024; i++)
{
long long t = (a[i]<<8) | (b[i]<<24);
if (t == 0)
return 1;
}
return 0;
}
where with -m32 -O2 -msse4.1 the above comparison with zero should look
like:
punpcklqdq %xmm0, %xmm0
ptest %xmm0, %xmm0
Although this patch includes one or two minor tweaks to provide all the
necessary infrastructure to support conversion of TImode comparisons to
V1TImode (and SImode comparisons to V4SImode), STV doesn't yet implement
these transformations, but this is something that can be considered after
stage 4. Indeed the new convert_compare functionality is split out
into a method to simplify its potential reuse by the timode_scalar_chain
class.
2022-05-30 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/70321
* config/i386/i386-expand.cc (ix86_expand_branch): Don't decompose
DI mode equality/inequality using XOR here. Instead generate a
COMPARE for doubleword modes (DImode on !TARGET_64BIT or TImode).
* config/i386/i386-features.cc (gen_gpr_to_xmm_move_src): Use
gen_rtx_SUBREG when NUNITS is 1, i.e. for TImode to V1TImode.
(general_scalar_chain::convert_compare): New function to convert
scalar equality/inequality comparison into vector operations.
(general_scalar_chain::convert_insn) [COMPARE]: Refactor. Call
new convert_compare helper method.
(convertible_comparison_p): Update to match doubleword COMPARE
of two register, memory or integer constant operands.
* config/i386/i386-features.h (general_scalar_chain::convert_compare):
Prototype/declare member function here.
* config/i386/i386.md (cstore<mode>4): Change mode to SDWIM, but
only allow new doubleword modes for EQ and NE operators.
(*cmp<dwi>_doubleword): New define_insn_and_split, to split a
doubleword comparison into a pair of XORs followed by an IOR to
set the (zero) flags register, optimizing the XORs if possible.
* config/i386/sse.md (V_AVX): Include V1TI and V2TI in mode
iterator; V_AVX is (currently) only used by ptest.
(sse4_1 mode attribute): Update to support V1TI and V2TI.
gcc/testsuite/ChangeLog
PR target/70321
* gcc.target/i386/pr70321.c: New test case.
* gcc.target/i386/sse4_1-stv-1.c: New test case.
2022-05-30 22:20:09 +02:00
|
|
|
if (mode != (TARGET_64BIT ? TImode : DImode))
|
2019-08-15 12:55:52 +02:00
|
|
|
return false;
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
if (!TARGET_SSE4_1)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
rtx def_set = single_set (insn);
|
|
|
|
|
|
|
|
gcc_assert (def_set);
|
|
|
|
|
|
|
|
rtx src = SET_SRC (def_set);
|
|
|
|
rtx dst = SET_DEST (def_set);
|
|
|
|
|
|
|
|
gcc_assert (GET_CODE (src) == COMPARE);
|
|
|
|
|
|
|
|
if (GET_CODE (dst) != REG
|
|
|
|
|| REGNO (dst) != FLAGS_REG
|
|
|
|
|| GET_MODE (dst) != CCZmode)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
rtx op1 = XEXP (src, 0);
|
|
|
|
rtx op2 = XEXP (src, 1);
|
|
|
|
|
2022-07-09 10:02:14 +02:00
|
|
|
/* *cmp<dwi>_doubleword. */
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
if ((CONST_SCALAR_INT_P (op1)
|
2022-07-09 10:02:14 +02:00
|
|
|
|| ((REG_P (op1) || MEM_P (op1))
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS].
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
&& GET_MODE (op1) == mode))
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
&& (CONST_SCALAR_INT_P (op2)
|
2022-07-09 10:02:14 +02:00
|
|
|
|| ((REG_P (op2) || MEM_P (op2))
|
|
|
|
&& GET_MODE (op2) == mode)))
|
|
|
|
return true;
|
|
|
|
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
/* *testti_doubleword. */
|
|
|
|
if (op2 == const0_rtx
|
|
|
|
&& GET_CODE (op1) == AND
|
|
|
|
&& REG_P (XEXP (op1, 0)))
|
|
|
|
{
|
|
|
|
rtx op12 = XEXP (op1, 1);
|
|
|
|
return GET_MODE (XEXP (op1, 0)) == TImode
|
|
|
|
&& (CONST_SCALAR_INT_P (op12)
|
|
|
|
|| ((REG_P (op12) || MEM_P (op12))
|
|
|
|
&& GET_MODE (op12) == TImode));
|
|
|
|
}
|
|
|
|
|
2022-07-09 10:02:14 +02:00
|
|
|
/* *test<dwi>_not_doubleword. */
|
|
|
|
if (op2 == const0_rtx
|
|
|
|
&& GET_CODE (op1) == AND
|
|
|
|
&& GET_CODE (XEXP (op1, 0)) == NOT)
|
|
|
|
{
|
|
|
|
rtx op11 = XEXP (XEXP (op1, 0), 0);
|
|
|
|
rtx op12 = XEXP (op1, 1);
|
|
|
|
return (REG_P (op11) || MEM_P (op11))
|
|
|
|
&& (REG_P (op12) || MEM_P (op12))
|
|
|
|
&& GET_MODE (op11) == mode
|
|
|
|
&& GET_MODE (op12) == mode;
|
|
|
|
}
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2022-07-09 10:02:14 +02:00
|
|
|
return false;
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
2019-08-15 12:55:52 +02:00
|
|
|
/* The general version of scalar_to_vector_candidate_p. */
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
static bool
|
2019-08-14 14:04:05 +02:00
|
|
|
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
x86: Allow V1TI vector register pushes
Add V1TI vector register push and split it after reload to a sequence
of:
(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))
so that STV pass can convert TI mode integer push to V1TI vector register
push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls
of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore
pseudo register push.
Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with
-mpreferred-stack-boundary=2 and leads to segfault:
Dump of assembler code for function __bid_nesd2:
0x08049210 <+0>: endbr32
0x08049214 <+4>: push %esi
0x08049215 <+5>: push %ebx
0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx>
0x0804921b <+11>: add $0x8de5,%ebx
0x08049221 <+17>: sub $0x20,%esp
0x08049224 <+20>: mov 0x30(%esp),%esi
0x08049228 <+24>: pushl 0x2c(%esp)
0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64>
0x08049231 <+33>: mov %esi,(%esp)
0x08049234 <+36>: movd %edx,%xmm1
0x08049238 <+40>: movd %eax,%xmm0
0x0804923c <+44>: punpckldq %xmm1,%xmm0
=> 0x08049240 <+48>: movaps %xmm0,0x10(%esp)
0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64>
0x0804924a <+58>: push %edx
0x0804924b <+59>: push %eax
0x0804924c <+60>: pushl 0x1c(%esp)
0x08049250 <+64>: pushl 0x1c(%esp)
0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal>
0x08049259 <+73>: add $0x34,%esp
0x0804925c <+76>: pop %ebx
0x0804925d <+77>: pop %esi
0x0804925e <+78>: ret
when libgcc is compiled with -msse2. According to GCC manual:
'-mpreferred-stack-boundary=NUM'
Attempt to keep the stack boundary aligned to a 2 raised to NUM
byte boundary. If '-mpreferred-stack-boundary' is not specified,
the default is 4 (16 bytes or 128-bits).
*Warning:* If you use this switch, then you must build all modules
with the same value, including any libraries. This includes the
system libraries and startup modules.
c-c++-common/dfp/func-vararg-mixed-2.c, which was added by
commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Wed Jul 30 19:24:02 2008 +0000
func-vararg-alternate-d128-2.c: New.
2008-07-30 H.J. Lu <hongjiu.lu@intel.com>
Joey Ye <joey.ye@intel.com>
* gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.
* gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.
isn't expected to work with libgcc.
gcc/
PR target/95021
* config/i386/i386-features.c (has_non_address_hard_reg):
Renamed to ...
(pseudo_reg_set): This. Return the SET expression. Ignore
pseudo register push.
(general_scalar_to_vector_candidate_p): Combine single_set and
has_non_address_hard_reg calls to pseudo_reg_set.
(timode_scalar_to_vector_candidate_p): Likewise.
* config/i386/i386.md (*pushv1ti2): New pattern.
gcc/testsuite/
PR target/95021
* c-c++-common/dfp/func-vararg-mixed-2.c: Removed.
* gcc.target/i386/pr95021-1.c: New test.
* gcc.target/i386/pr95021-2.c: Likewise.
* gcc.target/i386/pr95021-3.c: Likewise.
* gcc.target/i386/pr95021-4.c: Likewise.
* gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
|
|
|
rtx def_set = pseudo_reg_set (insn);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
if (!def_set)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
rtx src = SET_SRC (def_set);
|
|
|
|
rtx dst = SET_DEST (def_set);
|
|
|
|
|
|
|
|
if (GET_CODE (src) == COMPARE)
|
2019-08-14 14:04:05 +02:00
|
|
|
return convertible_comparison_p (insn, mode);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-15 12:55:52 +02:00
|
|
|
/* We are interested in "mode" only. */
|
2019-08-14 14:04:05 +02:00
|
|
|
if ((GET_MODE (src) != mode
|
2019-05-06 09:18:26 +02:00
|
|
|
&& !CONST_INT_P (src))
|
2019-08-14 14:04:05 +02:00
|
|
|
|| GET_MODE (dst) != mode)
|
2019-05-06 09:18:26 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!REG_P (dst) && !MEM_P (dst))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
switch (GET_CODE (src))
|
|
|
|
{
|
|
|
|
case ASHIFTRT:
|
|
|
|
if (!TARGET_AVX512VL)
|
|
|
|
return false;
|
|
|
|
/* FALLTHRU */
|
|
|
|
|
|
|
|
case ASHIFT:
|
|
|
|
case LSHIFTRT:
|
|
|
|
if (!CONST_INT_P (XEXP (src, 1))
|
2019-08-29 21:47:19 +02:00
|
|
|
|| !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
|
2019-05-06 09:18:26 +02:00
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
case SMAX:
|
|
|
|
case SMIN:
|
|
|
|
case UMAX:
|
|
|
|
case UMIN:
|
|
|
|
if ((mode == DImode && !TARGET_AVX512VL)
|
|
|
|
|| (mode == SImode && !TARGET_SSE4_1))
|
|
|
|
return false;
|
|
|
|
/* Fallthru. */
|
|
|
|
|
2021-07-01 10:56:32 +02:00
|
|
|
case AND:
|
2019-05-06 09:18:26 +02:00
|
|
|
case IOR:
|
|
|
|
case XOR:
|
2021-07-01 10:56:32 +02:00
|
|
|
case PLUS:
|
|
|
|
case MINUS:
|
2019-05-06 09:18:26 +02:00
|
|
|
if (!REG_P (XEXP (src, 1))
|
|
|
|
&& !MEM_P (XEXP (src, 1))
|
|
|
|
&& !CONST_INT_P (XEXP (src, 1)))
|
|
|
|
return false;
|
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
if (GET_MODE (XEXP (src, 1)) != mode
|
2019-05-06 09:18:26 +02:00
|
|
|
&& !CONST_INT_P (XEXP (src, 1)))
|
|
|
|
return false;
|
2021-07-01 10:56:32 +02:00
|
|
|
|
|
|
|
/* Check for andnot case. */
|
|
|
|
if (GET_CODE (src) != AND
|
|
|
|
|| GET_CODE (XEXP (src, 0)) != NOT)
|
|
|
|
break;
|
|
|
|
|
|
|
|
src = XEXP (src, 0);
|
|
|
|
/* FALLTHRU */
|
|
|
|
|
|
|
|
case NOT:
|
i386: Optimize abs expansion [PR97873]
The patch introduces absM named pattern to generate optimal insn sequence
for CMOVE_TARGET targets. Currently, the expansion goes through neg+max
optabs, and the following code is generated:
movl %edi, %eax
negl %eax
cmpl %edi, %eax
cmovl %edi, %eax
This sequence is suboptimal in two ways. a) The compare instruction is
not needed, since NEG insn sets the sign flag based on the result.
The CMOV can use sign flag to select between negated and original value:
movl %edi, %eax
negl %eax
cmovs %edi, %eax
b) On some targets, CMOV is undesirable due to its performance issues.
In addition to TARGET_EXPAND_ABS bypass, the patch introduces STV
conversion of abs RTX to use PABS SSE insn:
vmovd %edi, %xmm0
vpabsd %xmm0, %xmm0
vmovd %xmm0, %eax
The patch changes compare mode of NEG instruction to CCGOCmode,
which is the same mode as the mode of SUB instruction. IOW, sign bit
becomes usable.
Also, the mode iterator of <maxmin:code><mode>3 pattern is changed
to SWI48x instead of SWI248. The purpose of maxmin expander is to
prepare max/min RTX for STV to eventually convert them to SSE PMAX/PMIN
instructions, in order to *avoid* CMOV insns with general registers.
2020-11-20 Uroš Bizjak <ubizjak@gmail.com>
gcc/
PR target/97873
* config/i386/i386.md (*neg<mode>2_2): Rename from
"*neg<mode>2_cmpz". Use CCGOCmode instead of CCZmode.
(*negsi2_zext): Rename from *negsi2_cmpz_zext.
Use CCGOCmode instead of CCZmode.
(*neg<mode>_ccc_1): New insn pattern.
(*neg<dwi>2_doubleword): Use *neg<mode>_ccc_1.
(abs<mode>2): Add FLAGS_REG clobber.
Use TARGET_CMOVE insn predicate.
(*abs<mode>2_1): New insn_and_split pattern.
(*absdi2_doubleword): Ditto.
(<maxmin:code><mode>3): Use SWI48x mode iterator.
(*<maxmin:code><mode>3): Use SWI48 mode iterator.
* config/i386/i386-features.c
(general_scalar_chain::compute_convert_gain): Handle ABS code.
(general_scalar_chain::convert_insn): Ditto.
(general_scalar_to_vector_candidate_p): Ditto.
gcc/testsuite/
PR target/97873
* gcc.target/i386/pr97873.c: New test.
* gcc.target/i386/pr97873-1.c: New test.
2020-11-20 10:26:34 +01:00
|
|
|
break;
|
|
|
|
|
2021-07-01 10:56:32 +02:00
|
|
|
case NEG:
|
|
|
|
/* Check for nabs case. */
|
|
|
|
if (GET_CODE (XEXP (src, 0)) != ABS)
|
|
|
|
break;
|
|
|
|
|
|
|
|
src = XEXP (src, 0);
|
|
|
|
/* FALLTHRU */
|
|
|
|
|
i386: Optimize abs expansion [PR97873]
The patch introduces absM named pattern to generate optimal insn sequence
for CMOVE_TARGET targets. Currently, the expansion goes through neg+max
optabs, and the following code is generated:
movl %edi, %eax
negl %eax
cmpl %edi, %eax
cmovl %edi, %eax
This sequence is suboptimal in two ways. a) The compare instruction is
not needed, since NEG insn sets the sign flag based on the result.
The CMOV can use sign flag to select between negated and original value:
movl %edi, %eax
negl %eax
cmovs %edi, %eax
b) On some targets, CMOV is undesirable due to its performance issues.
In addition to TARGET_EXPAND_ABS bypass, the patch introduces STV
conversion of abs RTX to use PABS SSE insn:
vmovd %edi, %xmm0
vpabsd %xmm0, %xmm0
vmovd %xmm0, %eax
The patch changes compare mode of NEG instruction to CCGOCmode,
which is the same mode as the mode of SUB instruction. IOW, sign bit
becomes usable.
Also, the mode iterator of <maxmin:code><mode>3 pattern is changed
to SWI48x instead of SWI248. The purpose of maxmin expander is to
prepare max/min RTX for STV to eventually convert them to SSE PMAX/PMIN
instructions, in order to *avoid* CMOV insns with general registers.
2020-11-20 Uroš Bizjak <ubizjak@gmail.com>
gcc/
PR target/97873
* config/i386/i386.md (*neg<mode>2_2): Rename from
"*neg<mode>2_cmpz". Use CCGOCmode instead of CCZmode.
(*negsi2_zext): Rename from *negsi2_cmpz_zext.
Use CCGOCmode instead of CCZmode.
(*neg<mode>_ccc_1): New insn pattern.
(*neg<dwi>2_doubleword): Use *neg<mode>_ccc_1.
(abs<mode>2): Add FLAGS_REG clobber.
Use TARGET_CMOVE insn predicate.
(*abs<mode>2_1): New insn_and_split pattern.
(*absdi2_doubleword): Ditto.
(<maxmin:code><mode>3): Use SWI48x mode iterator.
(*<maxmin:code><mode>3): Use SWI48 mode iterator.
* config/i386/i386-features.c
(general_scalar_chain::compute_convert_gain): Handle ABS code.
(general_scalar_chain::convert_insn): Ditto.
(general_scalar_to_vector_candidate_p): Ditto.
gcc/testsuite/
PR target/97873
* gcc.target/i386/pr97873.c: New test.
* gcc.target/i386/pr97873-1.c: New test.
2020-11-20 10:26:34 +01:00
|
|
|
case ABS:
|
|
|
|
if ((mode == DImode && !TARGET_AVX512VL)
|
|
|
|
|| (mode == SImode && !TARGET_SSSE3))
|
|
|
|
return false;
|
2019-05-06 09:18:26 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case REG:
|
|
|
|
return true;
|
|
|
|
|
|
|
|
case MEM:
|
|
|
|
case CONST_INT:
|
|
|
|
return REG_P (dst);
|
|
|
|
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!REG_P (XEXP (src, 0))
|
|
|
|
&& !MEM_P (XEXP (src, 0))
|
2021-07-01 10:56:32 +02:00
|
|
|
&& !CONST_INT_P (XEXP (src, 0)))
|
|
|
|
return false;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2019-08-14 14:04:05 +02:00
|
|
|
if (GET_MODE (XEXP (src, 0)) != mode
|
2019-05-06 09:18:26 +02:00
|
|
|
&& !CONST_INT_P (XEXP (src, 0)))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
/* Check for a suitable TImode memory operand. */
|
|
|
|
|
|
|
|
static bool
|
|
|
|
timode_mem_p (rtx x)
|
|
|
|
{
|
|
|
|
return MEM_P (x)
|
|
|
|
&& (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
|
|
|
|
|| !misaligned_operand (x, TImode));
|
|
|
|
}
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
/* The TImode version of scalar_to_vector_candidate_p. */
|
|
|
|
|
|
|
|
static bool
|
|
|
|
timode_scalar_to_vector_candidate_p (rtx_insn *insn)
|
|
|
|
{
|
x86: Allow V1TI vector register pushes
Add V1TI vector register push and split it after reload to a sequence
of:
(set (reg:P SP_REG) (plus:P SP_REG) (const_int -8)))
(set (match_dup 0) (match_dup 1))
so that STV pass can convert TI mode integer push to V1TI vector register
push. Rename has_non_address_hard_reg to pseudo_reg_set, combine calls
of single_set and has_non_address_hard_reg to pseudo_reg_set, to ignore
pseudo register push.
Remove c-c++-common/dfp/func-vararg-mixed-2.c since it is compiled with
-mpreferred-stack-boundary=2 and leads to segfault:
Dump of assembler code for function __bid_nesd2:
0x08049210 <+0>: endbr32
0x08049214 <+4>: push %esi
0x08049215 <+5>: push %ebx
0x08049216 <+6>: call 0x8049130 <__x86.get_pc_thunk.bx>
0x0804921b <+11>: add $0x8de5,%ebx
0x08049221 <+17>: sub $0x20,%esp
0x08049224 <+20>: mov 0x30(%esp),%esi
0x08049228 <+24>: pushl 0x2c(%esp)
0x0804922c <+28>: call 0x804e600 <__bid32_to_bid64>
0x08049231 <+33>: mov %esi,(%esp)
0x08049234 <+36>: movd %edx,%xmm1
0x08049238 <+40>: movd %eax,%xmm0
0x0804923c <+44>: punpckldq %xmm1,%xmm0
=> 0x08049240 <+48>: movaps %xmm0,0x10(%esp)
0x08049245 <+53>: call 0x804e600 <__bid32_to_bid64>
0x0804924a <+58>: push %edx
0x0804924b <+59>: push %eax
0x0804924c <+60>: pushl 0x1c(%esp)
0x08049250 <+64>: pushl 0x1c(%esp)
0x08049254 <+68>: call 0x804b260 <__bid64_quiet_not_equal>
0x08049259 <+73>: add $0x34,%esp
0x0804925c <+76>: pop %ebx
0x0804925d <+77>: pop %esi
0x0804925e <+78>: ret
when libgcc is compiled with -msse2. According to GCC manual:
'-mpreferred-stack-boundary=NUM'
Attempt to keep the stack boundary aligned to a 2 raised to NUM
byte boundary. If '-mpreferred-stack-boundary' is not specified,
the default is 4 (16 bytes or 128-bits).
*Warning:* If you use this switch, then you must build all modules
with the same value, including any libraries. This includes the
system libraries and startup modules.
c-c++-common/dfp/func-vararg-mixed-2.c, which was added by
commit 3b2488ca6ece182f2136a20ee5fa0bb92f935b0f
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Wed Jul 30 19:24:02 2008 +0000
func-vararg-alternate-d128-2.c: New.
2008-07-30 H.J. Lu <hongjiu.lu@intel.com>
Joey Ye <joey.ye@intel.com>
* gcc.dg/dfp/func-vararg-alternate-d128-2.c: New.
* gcc.dg/dfp/func-vararg-mixed-2.c: Likewise.
isn't expected to work with libgcc.
gcc/
PR target/95021
* config/i386/i386-features.c (has_non_address_hard_reg):
Renamed to ...
(pseudo_reg_set): This. Return the SET expression. Ignore
pseudo register push.
(general_scalar_to_vector_candidate_p): Combine single_set and
has_non_address_hard_reg calls to pseudo_reg_set.
(timode_scalar_to_vector_candidate_p): Likewise.
* config/i386/i386.md (*pushv1ti2): New pattern.
gcc/testsuite/
PR target/95021
* c-c++-common/dfp/func-vararg-mixed-2.c: Removed.
* gcc.target/i386/pr95021-1.c: New test.
* gcc.target/i386/pr95021-2.c: Likewise.
* gcc.target/i386/pr95021-3.c: Likewise.
* gcc.target/i386/pr95021-4.c: Likewise.
* gcc.target/i386/pr95021-5.c: Likewise.
2020-05-17 19:10:34 +02:00
|
|
|
rtx def_set = pseudo_reg_set (insn);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
if (!def_set)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
rtx src = SET_SRC (def_set);
|
|
|
|
rtx dst = SET_DEST (def_set);
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is a
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
if (GET_CODE (src) == COMPARE)
|
|
|
|
return convertible_comparison_p (insn, TImode);
|
|
|
|
|
|
|
|
if (GET_MODE (dst) != TImode
|
|
|
|
|| (GET_MODE (src) != TImode
|
PR target/106278: Keep REG_EQUAL notes consistent during TImode STV on x86_64.
This patch resolves PR target/106278 a regression on x86_64 caused by my
recent TImode STV improvements. Now that TImode STV can handle comparisons
such as "(set (regs:CC) (compare:CC (reg:TI) ...))" the convert_insn method
sensibly checks that the mode of the SET_DEST is TImode before setting
it to V1TImode [to avoid V1TImode appearing on the hard reg CC_FLAGS.
Hence the current code looks like:
if (GET_MODE (dst) == TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
break;
which checks GET_MODE (dst) before calling PUT_MODE, and when a
change is made updating the REG_EQUAL_NOTE tmp if it exists.
The logical flaw (oversight) is that due to RTL sharing, the destination
of this set may already have been updated to V1TImode, as this chain is
being converted, but we still need to update any REG_EQUAL_NOTE that
still has TImode. Hence the correct code is actually:
if (GET_MODE (dst) == TImode)
{
PUT_MODE (dst, V1TImode);
fix_debug_reg_uses (dst);
}
if (GET_MODE (dst) == V1TImode)
{
tmp = find_reg_equal_equiv_note (insn);
if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
PUT_MODE (XEXP (tmp, 0), V1TImode);
}
break;
While fixing this behavior, I noticed I had some indentation whitespace
issues and some vestigial dead code in this function/method that I've
taken the liberty of cleaning up (as obvious) in this patch.
2022-07-15 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR target/106278
* config/i386/i386-features.cc (general_scalar_chain::convert_insn):
Fix indentation whitespace.
(timode_scalar_chain::fix_debug_reg_uses): Likewise.
(timode_scalar_chain::convert_insn): Delete dead code.
Update TImode REG_EQUAL_NOTE even if the SET_DEST is already V1TI.
Fix indentation whitespace.
(convertible_comparison_p): Likewise.
(timode_scalar_to_vector_candidate_p): Likewise.
gcc/testsuite/ChangeLog
* gcc.dg/pr106278.c: New test case.
2022-07-15 15:39:28 +02:00
|
|
|
&& !CONST_SCALAR_INT_P (src)))
|
2019-05-06 09:18:26 +02:00
|
|
|
return false;
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is a
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
if (!REG_P (dst) && !MEM_P (dst))
|
|
|
|
return false;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is a
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
if (MEM_P (dst)
|
|
|
|
&& misaligned_operand (dst, TImode)
|
|
|
|
&& !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
|
|
|
|
return false;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is a
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
switch (GET_CODE (src))
|
|
|
|
{
|
|
|
|
case REG:
|
|
|
|
case CONST_WIDE_INT:
|
|
|
|
return true;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is a
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible.  Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
case CONST_INT:
|
|
|
|
/* ??? Verify performance impact before enabling CONST_INT for
|
|
|
|
__int128 store. */
|
|
|
|
return standard_sse_constant_p (src, TImode);
|
2019-05-06 09:18:26 +02:00
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::~scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
case MEM:
|
|
|
|
/* Memory must be aligned or unaligned load is optimal. */
|
2019-05-06 09:18:26 +02:00
|
|
|
return (REG_P (dst)
|
|
|
|
&& (!misaligned_operand (src, TImode)
|
|
|
|
|| TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::~scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
case AND:
|
|
|
|
if (!MEM_P (dst)
|
|
|
|
&& GET_CODE (XEXP (src, 0)) == NOT
|
|
|
|
&& REG_P (XEXP (XEXP (src, 0), 0))
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
&& (REG_P (XEXP (src, 1))
|
|
|
|
|| CONST_SCALAR_INT_P (XEXP (src, 1))
|
|
|
|
|| timode_mem_p (XEXP (src, 1))))
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::~scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
return true;
|
|
|
|
return REG_P (XEXP (src, 0))
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
become loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
&& (REG_P (XEXP (src, 1))
|
|
|
|
|| CONST_SCALAR_INT_P (XEXP (src, 1))
|
|
|
|
|| timode_mem_p (XEXP (src, 1)));
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::~scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
|
|
|
|
case IOR:
|
|
|
|
case XOR:
|
|
|
|
return REG_P (XEXP (src, 0))
|
Use PTEST to perform AND in TImode STV of (A & B) != 0 on x86_64.
This x86_64 backend patch allows TImode STV to take advantage of the
fact that the PTEST instruction performs an AND operation. Previously
PTEST was (mostly) used for comparison against zero, by using the same
operands. The benefits are demonstrated by the new test case:
__int128 a,b;
int foo()
{
return (a & b) != 0;
}
Currently with -O2 -msse4 we generate:
movdqa a(%rip), %xmm0
pand b(%rip), %xmm0
xorl %eax, %eax
ptest %xmm0, %xmm0
setne %al
ret
with this patch we now generate:
movdqa a(%rip), %xmm0
xorl %eax, %eax
ptest b(%rip), %xmm0
setne %al
ret
Technically, the magic happens using new define_insn_and_split patterns.
Using two patterns allows this transformation to be performed independently
of whether TImode STV is run before or after combine. The one tricky
case is that immediate constant operands of the AND behave slightly
differently between TImode and V1TImode: All V1TImode immediate operands
becomes loads, but for TImode only values that are not hilo_operands
need to be loaded. Hence the new *testti_doubleword accepts any
general_operand, but internally during split calls force_reg whenever
the second operand is not x86_64_hilo_general_operand. This required
(benefits from) some tweaks to TImode STV to support CONST_WIDE_INT in
more places, using CONST_SCALAR_INT_P instead of just CONST_INT_P.
2022-08-09 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (scalar_chain::convert_compare):
Create new pseudos only when/if needed. Add support for TEST,
i.e. (COMPARE (AND x y) (const_int 0)), using UNSPEC_PTEST.
When broadcasting V2DImode and V4SImode use new pseudo register.
(timode_scalar_chain::convert_op): Do nothing if operand is
already V1TImode. Avoid generating useless SUBREG conversions,
i.e. (SUBREG:V1TImode (REG:V1TImode) 0). Handle CONST_WIDE_INT
in addition to CONST_INT by using CONST_SCALAR_INT_P.
(convertible_comparison_p): Use CONST_SCALAR_INT_P to match both
CONST_WIDE_INT and CONST_INT. Recognize new *testti_doubleword
pattern as an STV candidate.
(timode_scalar_to_vector_candidate_p): Allow CONST_SCALAR_INT_P
operands in binary logic operations.
* config/i386/i386.cc (ix86_rtx_costs) <case UNSPEC>: Add costs
for UNSPEC_PTEST; a PTEST that performs an AND has the same cost
as regular PTEST, i.e. cost->sse_op.
* config/i386/i386.md (*testti_doubleword): New pre-reload
define_insn_and_split that recognizes comparison of TI mode AND
against zero.
* config/i386/sse.md (*ptest<mode>_and): New pre-reload
define_insn_and_split that recognizes UNSPEC_PTEST of identical
AND operands.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-8.c: New test case.
2022-08-09 19:59:55 +02:00
|
|
|
&& (REG_P (XEXP (src, 1))
|
|
|
|
|| CONST_SCALAR_INT_P (XEXP (src, 1))
|
|
|
|
|| timode_mem_p (XEXP (src, 1)));
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline with -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain::convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::~scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check whether operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
|
|
|
|
case NOT:
|
|
|
|
return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
|
|
|
|
|
Support logical shifts by (some) integer constants in TImode STV on x86_64.
This patch improves TImode STV by adding support for logical shifts by
integer constants that are multiples of 8. For the test case:
unsigned __int128 a, b;
void foo() { a = b << 16; }
on x86_64, gcc -O2 currently generates:
movq b(%rip), %rax
movq b+8(%rip), %rdx
shldq $16, %rax, %rdx
salq $16, %rax
movq %rax, a(%rip)
movq %rdx, a+8(%rip)
ret
with this patch we now generate:
movdqa b(%rip), %xmm0
pslldq $2, %xmm0
movaps %xmm0, a(%rip)
ret
2022-08-03 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain): Add gain
for converting suitable TImode shift to a V1TImode shift.
(timode_scalar_chain::convert_insn): Add support for converting
suitable ASHIFT and LSHIFTRT.
(timode_scalar_to_vector_candidate_p): Consider logical shifts
by integer constants that are multiples of 8 to be candidates.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-7.c: New test case.
2022-08-03 10:00:20 +02:00
|
|
|
case ASHIFT:
|
|
|
|
case LSHIFTRT:
|
|
|
|
/* Handle logical shifts by integer constants between 0 and 120
|
|
|
|
that are multiples of 8. */
|
|
|
|
return REG_P (XEXP (src, 0))
|
|
|
|
&& CONST_INT_P (XEXP (src, 1))
|
|
|
|
&& (INTVAL (XEXP (src, 1)) & ~0x78) == 0;
|
|
|
|
|
Improved Scalar-To-Vector (STV) support for TImode to V1TImode on x86_64.
This patch upgrades x86_64's scalar-to-vector (STV) pass to more
aggressively transform 128-bit scalar TImode operations into vector
V1TImode operations performed on SSE registers. TImode functionality
already exists in STV, but only for move operations. This change
brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
and comparisons.
The effect of these changes are conveniently demonstrated by the new
sse4_1-stv-5.c test case:
__int128 a[16];
__int128 b[16];
__int128 c[16];
void foo()
{
for (unsigned int i=0; i<16; i++)
a[i] = b[i] & ~c[i];
}
which when currently compiled on mainline wtih -O2 -msse4 produces:
foo: xorl %eax, %eax
.L2: movq c(%rax), %rsi
movq c+8(%rax), %rdi
addq $16, %rax
notq %rsi
notq %rdi
andq b-16(%rax), %rsi
andq b-8(%rax), %rdi
movq %rsi, a-16(%rax)
movq %rdi, a-8(%rax)
cmpq $256, %rax
jne .L2
ret
but with this patch now produces:
foo: xorl %eax, %eax
.L2: movdqa c(%rax), %xmm0
pandn b(%rax), %xmm0
addq $16, %rax
movaps %xmm0, a-16(%rax)
cmpq $256, %rax
jne .L2
ret
Technically, the STV pass is implemented by three C++ classes, a common
abstract base class "scalar_chain" that contains common functionality,
and two derived classes: general_scalar_chain (which handles SI and
DI modes) and timode_scalar_chain (which handles TI modes). As
mentioned previously, because only TI mode moves were handled the
two worker classes behaved significantly differently. These changes
bring the functionality of these two classes closer together, which
is reflected by refactoring more shared code from general_scalar_chain
to the parent scalar_chain and reusing it from timode_scalar_chain.
There still remain significant differences (and simplifications) so
the existing division of classes (as specializations) continues to
make sense.
2022-07-11 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-features.h (scalar_chain): Add fields
insns_conv, n_sse_to_integer and n_integer_to_sse to this
parent class, moved from general_scalar_chain.
(scalar_chain::convert_compare): Protected method moved
from general_scalar_chain.
(mark_dual_mode_def): Make protected, not private virtual.
(scalar_chain:convert_op): New private virtual method.
(general_scalar_chain::general_scalar_chain): Simplify constructor.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(general_scalar_chain): Move insns_conv, n_sse_to_integer and
n_integer_to_sse fields to parent class, scalar_chain.
(general_scalar_chain::mark_dual_mode_def): Delete prototype.
(general_scalar_chain::convert_compare): Delete prototype.
(timode_scalar_chain::compute_convert_gain): Remove simplistic
implementation, convert to a method prototype.
(timode_scalar_chain::mark_dual_mode_def): Delete prototype.
(timode_scalar_chain::convert_op): Prototype new virtual method.
* config/i386/i386-features.cc (scalar_chain::scalar_chain):
Allocate insns_conv and initialize n_sse_to_integer and
n_integer_to_sse fields in constructor.
(scalar_chain::scalar_chain): Free insns_conv in destructor.
(general_scalar_chain::general_scalar_chain): Delete
constructor, now defined in the class declaration.
(general_scalar_chain::~general_scalar_chain): Delete destructor.
(scalar_chain::mark_dual_mode_def): Renamed from
general_scalar_chain::mark_dual_mode_def.
(timode_scalar_chain::mark_dual_mode_def): Delete.
(scalar_chain::convert_compare): Renamed from
general_scalar_chain::convert_compare.
(timode_scalar_chain::compute_convert_gain): New method to
determine the gain from converting a TImode chain to V1TImode.
(timode_scalar_chain::convert_op): New method to convert an
operand from TImode to V1TImode.
(timode_scalar_chain::convert_insn) <case REG>: Only PUT_MODE
on REG_EQUAL notes that were originally TImode (not CONST_INT).
Handle AND, ANDN, XOR, IOR, NOT and COMPARE.
(timode_mem_p): Helper predicate to check where operand is
memory reference with sufficient alignment for TImode STV.
(timode_scalar_to_vector_candidate_p): Use convertible_comparison_p
to check whether COMPARE is convertible. Handle SET_DESTs that
that are REG_P or MEM_P and SET_SRCs that are REG, CONST_INT,
CONST_WIDE_INT, MEM, AND, ANDN, IOR, XOR or NOT.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse4_1-stv-2.c: New test case, pand.
* gcc.target/i386/sse4_1-stv-3.c: New test case, por.
* gcc.target/i386/sse4_1-stv-4.c: New test case, pxor.
* gcc.target/i386/sse4_1-stv-5.c: New test case, pandn.
* gcc.target/i386/sse4_1-stv-6.c: New test case, ptest.
2022-07-11 17:04:46 +02:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* For a register REGNO, scan instructions for its defs and uses.
|
|
|
|
Put REGNO in REGS if a def or use isn't in CANDIDATES. */
|
|
|
|
|
|
|
|
static void
|
|
|
|
timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
|
|
|
|
unsigned int regno)
|
|
|
|
{
|
2022-07-31 22:44:51 +02:00
|
|
|
/* Do nothing if REGNO is already in REGS or is a hard reg. */
|
|
|
|
if (bitmap_bit_p (regs, regno)
|
|
|
|
|| HARD_REGISTER_NUM_P (regno))
|
|
|
|
return;
|
|
|
|
|
2019-05-06 09:18:26 +02:00
|
|
|
for (df_ref def = DF_REG_DEF_CHAIN (regno);
|
|
|
|
def;
|
|
|
|
def = DF_REF_NEXT_REG (def))
|
|
|
|
{
|
|
|
|
if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
|
|
|
|
{
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file,
|
|
|
|
"r%d has non convertible def in insn %d\n",
|
|
|
|
regno, DF_REF_INSN_UID (def));
|
|
|
|
|
|
|
|
bitmap_set_bit (regs, regno);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (df_ref ref = DF_REG_USE_CHAIN (regno);
|
|
|
|
ref;
|
|
|
|
ref = DF_REF_NEXT_REG (ref))
|
|
|
|
{
|
|
|
|
/* Debug instructions are skipped. */
|
|
|
|
if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
|
|
|
|
&& !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
|
|
|
|
{
|
|
|
|
if (dump_file)
|
|
|
|
fprintf (dump_file,
|
|
|
|
"r%d has non convertible use in insn %d\n",
|
|
|
|
regno, DF_REF_INSN_UID (ref));
|
|
|
|
|
|
|
|
bitmap_set_bit (regs, regno);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-31 22:44:51 +02:00
|
|
|
/* For a given bitmap of insn UIDs scans all instructions and
   remove insn from CANDIDATES in case it has both convertible
   and not convertible definitions.

   All insns in a bitmap are conversion candidates according to
   scalar_to_vector_candidate_p.  Currently it implies all insns
   are single_set.  */

static void
timode_remove_non_convertible_regs (bitmap candidates)
{
  bitmap_iterator bi;
  unsigned id;
  /* REGS accumulates TImode pseudos that have a def or use outside
     of CANDIDATES and hence must not be converted.  */
  bitmap regs = BITMAP_ALLOC (NULL);
  bool changed;

  /* Iterate to a fixed point: removing an insn from CANDIDATES can make
     further registers non-convertible, which in turn can invalidate more
     insns (PR target/106303).  */
  do {
    changed = false;
    /* Pass 1: for every candidate insn, flag any TImode register it
       defines or uses that also appears in a non-candidate insn.  */
    EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
      {
	rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
	df_ref ref;

	FOR_EACH_INSN_DEF (ref, insn)
	  if (!DF_REF_REG_MEM_P (ref)
	      && GET_MODE (DF_REF_REG (ref)) == TImode)
	    timode_check_non_convertible_regs (candidates, regs,
					       DF_REF_REGNO (ref));

	FOR_EACH_INSN_USE (ref, insn)
	  if (!DF_REF_REG_MEM_P (ref)
	      && GET_MODE (DF_REF_REG (ref)) == TImode)
	    timode_check_non_convertible_regs (candidates, regs,
					       DF_REF_REGNO (ref));
      }

    /* Pass 2: drop from CANDIDATES every insn that defines or uses one
       of the non-convertible registers collected in REGS.  */
    EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
      {
	for (df_ref def = DF_REG_DEF_CHAIN (id);
	     def;
	     def = DF_REF_NEXT_REG (def))
	  if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	    {
	      if (dump_file)
		fprintf (dump_file, "Removing insn %d from candidates list\n",
			 DF_REF_INSN_UID (def));

	      bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
	      changed = true;
	    }

	for (df_ref ref = DF_REG_USE_CHAIN (id);
	     ref;
	     ref = DF_REF_NEXT_REG (ref))
	  if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	    {
	      if (dump_file)
		fprintf (dump_file, "Removing insn %d from candidates list\n",
			 DF_REF_INSN_UID (ref));

	      bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
	      changed = true;
	    }
      }
  } while (changed);

  BITMAP_FREE (regs);
}
|
|
|
|
|
|
|
|
/* Main STV pass function.  Find and convert scalar
   instructions into vector mode when profitable.

   TIMODE_P selects which flavor of the pass runs: true converts
   TImode chains to V1TImode, false converts SImode/DImode chains
   to V4SImode/V2DImode.  Always returns 0 (pass todo flags).  */

static unsigned int
convert_scalars_to_vector (bool timode_p)
{
  basic_block bb;
  int converted_insns = 0;

  bitmap_obstack_initialize (NULL);
  /* Candidate bitmaps are indexed in parallel with these mode arrays:
     slot 0 = SImode, slot 1 = DImode, slot 2 = TImode.  */
  const machine_mode cand_mode[3] = { SImode, DImode, TImode };
  const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
  bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
  for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);

  /* Set up the dataflow (def-use/use-def chains) the chain builders rely
     on; DF_DEFER_INSN_RESCAN postpones rescans until after conversion.  */
  calculate_dominance_info (CDI_DOMINATORS);
  df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze ();

  /* Find all instructions we want to convert into vector mode. */
  if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");

  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *insn;
      FOR_BB_INSNS (bb, insn)
	if (timode_p
	    && timode_scalar_to_vector_candidate_p (insn))
	  {
	    if (dump_file)
	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
		       INSN_UID (insn));

	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
	  }
	else if (!timode_p)
	  {
	    /* Check {SI,DI}mode.  */
	    for (unsigned i = 0; i <= 1; ++i)
	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
		{
		  if (dump_file)
		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");

		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
		  break;
		}
	  }
    }

  /* TImode candidates whose registers escape into non-candidate insns
     must be pruned (iterated to a fixed point inside the callee).  */
  if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);

  /* Dump a note only when all three candidate sets came up empty.  */
  for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");

  /* Build a chain from each remaining candidate and convert it when the
     estimated gain is positive; chain->build consumes the candidate bits
     it absorbs, so the while loop terminates.  */
  for (unsigned i = 0; i <= 2; ++i)
    while (!bitmap_empty_p (&candidates[i]))
      {
	unsigned uid = bitmap_first_set_bit (&candidates[i]);
	scalar_chain *chain;

	if (cand_mode[i] == TImode)
	  chain = new timode_scalar_chain;
	else
	  chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);

	/* Find instructions chain we want to convert to vector mode.
	   Check all uses and definitions to estimate all required
	   conversions.  */
	chain->build (&candidates[i], uid);

	if (chain->compute_convert_gain () > 0)
	  converted_insns += chain->convert ();
	else
	  if (dump_file)
	    fprintf (dump_file, "Chain #%d conversion is not profitable\n",
		     chain->chain_id);

	delete chain;
      }

  if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

  for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);

  bitmap_obstack_release (NULL);
  df_process_deferred_rescans ();

  /* Conversion means we may have 128bit register spills/fills
     which require aligned stack. */
  if (converted_insns)
    {
      if (crtl->stack_alignment_needed < 128)
	crtl->stack_alignment_needed = 128;
      if (crtl->stack_alignment_estimated < 128)
	crtl->stack_alignment_estimated = 128;

      /* Recompute whether dynamic stack realignment is required now that
	 the alignment demand may have grown past the incoming boundary.  */
      crtl->stack_realign_needed
	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
      crtl->stack_realign_tried = crtl->stack_realign_needed;

      crtl->stack_realign_processed = true;

      if (!crtl->drap_reg)
	{
	  rtx drap_rtx = targetm.calls.get_drap_rtx ();

	  /* stack_realign_drap and drap_rtx must match.  */
	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));

	  /* Do nothing if NULL is returned,
	     which means DRAP is not needed.  */
	  if (drap_rtx != NULL)
	    {
	      crtl->args.internal_arg_pointer = drap_rtx;

	      /* Call fixup_tail_calls to clean up
		 REG_EQUIV note if DRAP is needed.  */
	      fixup_tail_calls ();
	    }
	}

      /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
      if (TARGET_64BIT)
	for (tree parm = DECL_ARGUMENTS (current_function_decl);
	     parm; parm = DECL_CHAIN (parm))
	  {
	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
	      continue;
	    /* Conversion may have left argument RTL in V1TImode; rewrap
	       it as a TImode subreg so debug/arg info stays scalar.  */
	    if (DECL_RTL_SET_P (parm)
		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_RTL (parm);
		if (REG_P (r))
		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
	      }
	    if (DECL_INCOMING_RTL (parm)
		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_INCOMING_RTL (parm);
		if (REG_P (r))
		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
	      }
	  }
    }

  return 0;
}
|
|
|
|
|
|
|
|
static unsigned int
|
|
|
|
rest_of_handle_insert_vzeroupper (void)
|
|
|
|
{
|
2021-06-01 03:09:44 +02:00
|
|
|
/* vzeroupper instructions are inserted immediately after reload to
|
|
|
|
account for possible spills from 256bit or 512bit registers. The pass
|
|
|
|
reuses mode switching infrastructure by re-running mode insertion
|
|
|
|
pass, so disable entities that have already been processed. */
|
|
|
|
for (int i = 0; i < MAX_386_ENTITIES; i++)
|
|
|
|
ix86_optimize_mode_switching[i] = 0;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2021-06-01 03:09:44 +02:00
|
|
|
ix86_optimize_mode_switching[AVX_U128] = 1;
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2021-06-01 03:09:44 +02:00
|
|
|
/* Call optimize_mode_switching. */
|
|
|
|
g->get_passes ()->execute_pass_mode_switching ();
|
|
|
|
|
|
|
|
df_analyze ();
|
2019-05-06 09:18:26 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
/* Pass metadata for the "vzeroupper" RTL pass implemented by
   pass_insert_vzeroupper below.  */
const pass_data pass_data_insert_vzeroupper =
{
  RTL_PASS, /* type */
  "vzeroupper", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};
|
|
|
|
|
|
|
|
class pass_insert_vzeroupper : public rtl_opt_pass
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
pass_insert_vzeroupper(gcc::context *ctxt)
|
|
|
|
: rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
|
|
|
|
{}
|
|
|
|
|
|
|
|
/* opt_pass methods: */
|
2022-06-27 23:00:33 +02:00
|
|
|
bool gate (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2021-06-01 03:09:44 +02:00
|
|
|
return TARGET_AVX && TARGET_VZEROUPPER
|
|
|
|
&& flag_expensive_optimizations && !optimize_size;
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
2022-06-27 23:00:33 +02:00
|
|
|
unsigned int execute (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
return rest_of_handle_insert_vzeroupper ();
|
|
|
|
}
|
|
|
|
|
|
|
|
}; // class pass_insert_vzeroupper
|
|
|
|
|
|
|
|
/* Pass metadata for the "stv" (scalar-to-vector) RTL pass implemented
   by pass_stv below.  */
const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};
|
|
|
|
|
|
|
|
class pass_stv : public rtl_opt_pass
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
pass_stv (gcc::context *ctxt)
|
|
|
|
: rtl_opt_pass (pass_data_stv, ctxt),
|
|
|
|
timode_p (false)
|
|
|
|
{}
|
|
|
|
|
|
|
|
/* opt_pass methods: */
|
2022-06-27 23:00:33 +02:00
|
|
|
bool gate (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-20 10:45:56 +02:00
|
|
|
return ((!timode_p || TARGET_64BIT)
|
2019-05-06 09:18:26 +02:00
|
|
|
&& TARGET_STV && TARGET_SSE2 && optimize > 1);
|
|
|
|
}
|
|
|
|
|
2022-06-27 23:00:33 +02:00
|
|
|
unsigned int execute (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2019-08-20 10:45:56 +02:00
|
|
|
return convert_scalars_to_vector (timode_p);
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
2022-06-27 23:00:33 +02:00
|
|
|
opt_pass *clone () final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
return new pass_stv (m_ctxt);
|
|
|
|
}
|
|
|
|
|
2022-06-27 23:00:33 +02:00
|
|
|
void set_pass_param (unsigned int n, bool param) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
gcc_assert (n == 0);
|
|
|
|
timode_p = param;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
bool timode_p;
|
|
|
|
}; // class pass_stv
|
|
|
|
|
|
|
|
} // anon namespace
|
|
|
|
|
|
|
|
/* Factory entry point used by the pass manager to create the
   vzeroupper insertion pass.  */
rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
  return new pass_insert_vzeroupper (ctxt);
}
|
|
|
|
|
|
|
|
/* Factory entry point used by the pass manager to create the STV
   (scalar-to-vector) pass.  */
rtl_opt_pass *
make_pass_stv (gcc::context *ctxt)
{
  return new pass_stv (ctxt);
}
|
|
|
|
|
2020-02-03 19:22:57 +01:00
|
|
|
/* Inserting ENDBR and pseudo patchable-area instructions. */
|
2019-05-06 09:18:26 +02:00
|
|
|
|
2020-02-03 19:22:57 +01:00
|
|
|
/* Worker for pass_insert_endbr_and_patchable_area.  If NEED_ENDBR,
   insert ENDBR instructions at the function entrance and at every
   location that an indirect branch may legitimately target: after
   setjmp-like calls, after calls to functions with the
   "indirect_return" attribute, at the targets of switch jump tables
   (under -mcet-switch), and at preserved labels.  If
   PATCHABLE_AREA_SIZE is nonzero, also emit a patchable-area insn of
   that size at the function entrance (after the entry ENDBR, if
   any).  */
static void
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
					 unsigned int patchable_area_size)
{
  rtx endbr;
  rtx_insn *insn;
  rtx_insn *endbr_insn = NULL;
  basic_block bb;

  if (need_endbr)
    {
      /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
	 is absent among function attributes.  Later an optimization will
	 be introduced to make analysis if an address of a static function
	 is taken.  A static function whose address is not taken will get
	 a nocf_check attribute.  This will allow to reduce the number of
	 EB.  */
      if (!lookup_attribute ("nocf_check",
			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
	  && (!flag_manual_endbr
	      || lookup_attribute ("cf_check",
				   DECL_ATTRIBUTES (cfun->decl)))
	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	      || ix86_cmodel == CM_LARGE
	      || ix86_cmodel == CM_LARGE_PIC
	      || flag_force_indirect_call
	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
		  && DECL_DLLIMPORT_P (cfun->decl))))
	{
	  if (crtl->profile && flag_fentry)
	    {
	      /* Queue ENDBR insertion to x86_function_profiler.
		 NB: Any patchable-area insn will be inserted after
		 ENDBR.  */
	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
	    }
	  else
	    {
	      /* Emit the entry ENDBR before the first insn of the first
		 real basic block and remember it so that a following
		 patchable area can be placed right after it.  */
	      endbr = gen_nop_endbr ();
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      rtx_insn *insn = BB_HEAD (bb);
	      endbr_insn = emit_insn_before (endbr, insn);
	    }
	}
    }

  if (patchable_area_size)
    {
      if (crtl->profile && flag_fentry)
	{
	  /* Queue patchable-area insertion to x86_function_profiler.
	     NB: If there is a queued ENDBR, x86_function_profiler
	     will also handle patchable-area.  */
	  if (!cfun->machine->insn_queued_at_entrance)
	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
	}
      else
	{
	  rtx patchable_area
	    = gen_patchable_area (GEN_INT (patchable_area_size),
				  GEN_INT (crtl->patch_area_entry == 0));
	  /* Place the patchable area after the entry ENDBR when one was
	     emitted, otherwise at the very start of the function.  */
	  if (endbr_insn)
	    emit_insn_after (patchable_area, endbr_insn);
	  else
	    {
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      insn = BB_HEAD (bb);
	      emit_insn_before (patchable_area, insn);
	    }
	}
    }

  if (!need_endbr)
    return;

  /* Scan the whole function for the remaining places that need an
     ENDBR.  */
  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      /* Setjmp-like calls (REG_SETJMP note) can "return" via an
		 indirect branch, so they need an ENDBR afterwards.  */
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for non-tail call which
		     may return via indirect branch.  */
		  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after CALL, which can return more than
		 twice, setjmp-like functions.  */
	      endbr = gen_nop_endbr ();
	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check the jump is a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump find out all places it jumps and insert
		 ENDBRANCH there.  It should be done under a special flag to
		 control ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  rtx_insn *insn;

		  /* Each successor block of the switch starts with a
		     label; put the ENDBR right after it.  */
		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  endbr = gen_nop_endbr ();
		  emit_insn_after (endbr, insn);
		}
	      continue;
	    }

	  /* Preserved labels may be reached indirectly (e.g. their
	     address is taken), so they need an ENDBR too.  */
	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      endbr = gen_nop_endbr ();
	      emit_insn_after (endbr, insn);
	      continue;
	    }
	}
    }

  return;
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
2020-02-03 19:22:57 +01:00
|
|
|
/* Pass metadata for the "endbr_and_patchable_area" RTL pass implemented
   by pass_insert_endbr_and_patchable_area below.  */
const pass_data pass_data_insert_endbr_and_patchable_area =
{
  RTL_PASS, /* type. */
  "endbr_and_patchable_area", /* name. */
  OPTGROUP_NONE, /* optinfo_flags. */
  TV_MACH_DEP, /* tv_id. */
  0, /* properties_required. */
  0, /* properties_provided. */
  0, /* properties_destroyed. */
  0, /* todo_flags_start. */
  0, /* todo_flags_finish. */
};
|
|
|
|
|
2020-02-03 19:22:57 +01:00
|
|
|
class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
public:
|
2020-02-03 19:22:57 +01:00
|
|
|
pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
|
|
|
|
: rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
|
2019-05-06 09:18:26 +02:00
|
|
|
{}
|
|
|
|
|
|
|
|
/* opt_pass methods: */
|
2022-06-27 23:00:33 +02:00
|
|
|
bool gate (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2020-02-03 19:22:57 +01:00
|
|
|
need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
|
|
|
|
patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
|
|
|
|
return need_endbr || patchable_area_size;
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
2022-06-27 23:00:33 +02:00
|
|
|
unsigned int execute (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2020-02-03 19:22:57 +01:00
|
|
|
timevar_push (TV_MACH_DEP);
|
|
|
|
rest_of_insert_endbr_and_patchable_area (need_endbr,
|
|
|
|
patchable_area_size);
|
|
|
|
timevar_pop (TV_MACH_DEP);
|
|
|
|
return 0;
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
2020-02-03 19:22:57 +01:00
|
|
|
private:
|
|
|
|
bool need_endbr;
|
|
|
|
unsigned int patchable_area_size;
|
|
|
|
}; // class pass_insert_endbr_and_patchable_area
|
2019-05-06 09:18:26 +02:00
|
|
|
|
|
|
|
} // anon namespace
|
|
|
|
|
|
|
|
/* Factory entry point used by the pass manager to create the
   ENDBR/patchable-area insertion pass.  */
rtl_opt_pass *
make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
{
  return new pass_insert_endbr_and_patchable_area (ctxt);
}
|
|
|
|
|
|
|
|
/* At entry of the nearest common dominator for basic blocks with
|
2021-09-18 04:49:54 +02:00
|
|
|
conversions/rcp/sqrt/rsqrt/round, generate a single
|
2019-05-06 09:18:26 +02:00
|
|
|
vxorps %xmmN, %xmmN, %xmmN
|
|
|
|
for all
|
|
|
|
vcvtss2sd op, %xmmN, %xmmX
|
|
|
|
vcvtsd2ss op, %xmmN, %xmmX
|
|
|
|
vcvtsi2ss op, %xmmN, %xmmX
|
|
|
|
vcvtsi2sd op, %xmmN, %xmmX
|
|
|
|
|
|
|
|
NB: We want to generate only a single vxorps to cover the whole
|
|
|
|
function. The LCM algorithm isn't appropriate here since it may
|
|
|
|
place a vxorps inside the loop. */
|
|
|
|
|
|
|
|
/* Worker for pass_remove_partial_avx_dependency.  Rewrite scalar
   conversion insns marked with the avx_partial_xmm_update attribute so
   that they write a full XMM register (vec_duplicate + vec_merge with a
   zeroed register) instead of only its low element, breaking the false
   dependency on the destination's previous contents.  A single shared
   zero register (v4sf_const0) is initialized by one vxorps placed at a
   dominator of all converted blocks.  Returns 0 (no extra TODO
   flags).  */
static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  /* Set of basic block indices that contain a converted insn; used to
     choose where to place the single vxorps.  */
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  /* Pseudo holding the all-zero vector; allocated lazily on the first
     conversion.  */
  rtx v4sf_const0 = NULL_RTX;

  /* New insns that carry a REG_EH_REGION note and may therefore end a
     basic block; fixed up after the main walk.  */
  auto_vec<rtx_insn *> control_flow_insns;

  /* We create invalid RTL initially so defer rescans.  */
  df_set_flags (DF_DEFER_INSN_RESCAN);

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
	     round, to vec_dup and vec_merge with subreg.  */
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);
	  bool convert_p = false;
	  switch (GET_CODE (src))
	    {
	    case FLOAT:
	    case FLOAT_EXTEND:
	    case FLOAT_TRUNCATE:
	    case UNSIGNED_FLOAT:
	      convert_p = true;
	      break;
	    default:
	      break;
	    }

	  /* Only handle conversions here; skip them when the tuning
	     says the partial-register dependency is not a problem or
	     vector converts are preferred.  */
	  machine_mode src_mode
	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
	  switch (src_mode)
	    {
	    case E_SFmode:
	    case E_DFmode:
	      if (TARGET_USE_VECTOR_FP_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_SImode:
	    case E_DImode:
	      if (TARGET_USE_VECTOR_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_VOIDmode:
	      gcc_assert (!convert_p);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  if (!v4sf_const0)
	    v4sf_const0 = gen_reg_rtx (V4SFmode);

	  /* Pick the vector mode matching the scalar destination and a
	     zero rtx of that mode (subregs of the shared V4SF zero).  */
	  rtx zero;
	  machine_mode dest_vecmode;
	  switch (dest_mode)
	    {
	    case E_HFmode:
	      dest_vecmode = V8HFmode;
	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
	      break;
	    case E_SFmode:
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	      break;
	    case E_DFmode:
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  /* Change source to vector mode.  */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode.  */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET.  */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);
	  df_insn_rescan (set_insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note.  */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  /* The original insn now just moves the low element of the new
	     vector pseudo into the scalar destination.  */
	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions.  */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      /* Generate a vxorps at entry of the nearest dominator for basic
	 blocks with conversions, which is in the fake loop that
	 contains the whole function, so that there is only a single
	 vxorps in the whole function.  */
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
					     convert_bbs);
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);

      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));

      /* Find the first real (non-debug) insn of the chosen block so the
	 vxorps can be placed before any use.  */
      insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}
      if (insn == BB_HEAD (bb))
	set_insn = emit_insn_before (set, insn);
      else
	set_insn = emit_insn_after (set,
				    insn ? PREV_INSN (insn) : BB_END (bb));
      df_insn_rescan (set_insn);
      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  df_process_deferred_rescans ();
  df_clear_flags (DF_DEFER_INSN_RESCAN);
  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
/* Pass metadata for the "rpad" (remove partial AVX dependency) RTL
   pass implemented by pass_remove_partial_avx_dependency below.  */
const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
|
|
|
|
|
|
|
|
class pass_remove_partial_avx_dependency : public rtl_opt_pass
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
pass_remove_partial_avx_dependency (gcc::context *ctxt)
|
|
|
|
: rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
|
|
|
|
{}
|
|
|
|
|
|
|
|
/* opt_pass methods: */
|
2022-06-27 23:00:33 +02:00
|
|
|
bool gate (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
2021-07-13 12:22:03 +02:00
|
|
|
return (TARGET_AVX
|
|
|
|
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
|
|
|
|
&& TARGET_SSE_MATH
|
|
|
|
&& optimize
|
|
|
|
&& optimize_function_for_speed_p (cfun));
|
2019-05-06 09:18:26 +02:00
|
|
|
}
|
|
|
|
|
2022-06-27 23:00:33 +02:00
|
|
|
unsigned int execute (function *) final override
|
2019-05-06 09:18:26 +02:00
|
|
|
{
|
|
|
|
return remove_partial_avx_dependency ();
|
|
|
|
}
|
|
|
|
}; // class pass_rpad
|
|
|
|
|
|
|
|
} // anon namespace
|
|
|
|
|
|
|
|
/* Factory entry point used by the pass manager to create the rpad
   (remove partial AVX dependency) pass.  */
rtl_opt_pass *
make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
{
  return new pass_remove_partial_avx_dependency (ctxt);
}
|
|
|
|
|
|
|
|
/* This compares the priority of target features in function DECL1
|
|
|
|
and DECL2. It returns positive value if DECL1 is higher priority,
|
|
|
|
negative value if DECL2 is higher priority and 0 if they are the
|
|
|
|
same. */
|
|
|
|
|
|
|
|
int
|
|
|
|
ix86_compare_version_priority (tree decl1, tree decl2)
|
|
|
|
{
|
|
|
|
unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
|
|
|
|
unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
|
|
|
|
|
|
|
|
return (int)priority1 - (int)priority2;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* V1 and V2 point to function versions with different priorities
|
|
|
|
based on the target ISA. This function compares their priorities. */
|
|
|
|
|
|
|
|
static int
|
|
|
|
feature_compare (const void *v1, const void *v2)
|
|
|
|
{
|
|
|
|
typedef struct _function_version_info
|
|
|
|
{
|
|
|
|
tree version_decl;
|
|
|
|
tree predicate_chain;
|
|
|
|
unsigned int dispatch_priority;
|
|
|
|
} function_version_info;
|
|
|
|
|
|
|
|
const function_version_info c1 = *(const function_version_info *)v1;
|
|
|
|
const function_version_info c2 = *(const function_version_info *)v2;
|
|
|
|
return (c2.dispatch_priority - c1.dispatch_priority);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
|
|
|
|
to return a pointer to VERSION_DECL if the outcome of the expression
|
|
|
|
formed by PREDICATE_CHAIN is true. This function will be called during
|
|
|
|
version dispatch to decide which function version to execute. It returns
|
|
|
|
the basic block at the end, to which more conditions can be added. */
|
|
|
|
|
|
|
|
/* This adds a condition to the basic_block NEW_BB in function
   FUNCTION_DECL to return a pointer to VERSION_DECL if the outcome of
   the expression formed by PREDICATE_CHAIN is true.  Called during
   version dispatch to decide which function version to execute.
   Returns the basic block at the end, to which more conditions can be
   added.  PREDICATE_CHAIN is a TREE_LIST of (predicate function,
   argument) pairs that must all be nonzero; if it is NULL_TREE the
   version is returned unconditionally.  */
static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     tree predicate_chain, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *call_cond_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  tree cond_var, and_expr_var = NULL_TREE;
  gimple_seq gseq;

  tree predicate_decl, predicate_arg;

  /* All statements below are built inside the dispatcher function.  */
  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);


  /* Build "result_var = (void *) &version_decl; return result_var;"  */
  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
			 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  /* No predicates: return this version unconditionally.  */
  if (predicate_chain == NULL_TREE)
    {
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  /* Emit one predicate call per chain entry and AND the results
     together (via MIN_EXPR) into and_expr_var.  */
  while (predicate_chain != NULL)
    {
      cond_var = create_tmp_var (integer_type_node);
      predicate_decl = TREE_PURPOSE (predicate_chain);
      predicate_arg = TREE_VALUE (predicate_chain);
      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
      gimple_set_bb (call_cond_stmt, new_bb);
      gimple_seq_add_stmt (&gseq, call_cond_stmt);

      predicate_chain = TREE_CHAIN (predicate_chain);

      if (and_expr_var == NULL)
	and_expr_var = cond_var;
      else
	{
	  gimple *assign_stmt;
	  /* Use MIN_EXPR to check if any integer is zero.
	     and_expr_var = min_expr <cond_var, and_expr_var>  */
	  assign_stmt = gimple_build_assign (and_expr_var,
					     build2 (MIN_EXPR, integer_type_node,
						     cond_var, and_expr_var));

	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
	  gimple_set_bb (assign_stmt, new_bb);
	  gimple_seq_add_stmt (&gseq, assign_stmt);
	}
    }

  /* if (and_expr_var > 0) — all predicates returned nonzero.  */
  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
				    integer_zero_node,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  /* Split the single sequence into the diamond:
     bb1 (predicates + cond) -> bb2 (return this version) and
     bb1 -> bb3 (fall through, next condition).  */
  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  bb3 = e23->dest;
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  /* bb2 ends in a return, so it goes straight to the exit block.  */
  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}
|
|
|
|
|
|
|
|
/* This function generates the dispatch function for
   multi-versioned functions.  DISPATCH_DECL is the function which will
   contain the dispatch logic.  FNDECLS are the function choices for
   dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
   in DISPATCH_DECL in which the dispatch code is generated.  */

static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  tree default_decl;
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  int ix;
  tree ele;
  vec<tree> *fndecls;
  unsigned int num_versions = 0;
  unsigned int actual_versions = 0;
  unsigned int i;

  /* Per-version record gathered below and sorted by dispatch priority
     before the dispatch conditions are emitted.  */
  struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    }*function_version_info;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  /*fndecls_p is actually a vector. */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default. */
  num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  /* One slot per non-default version; entries with a NULL predicate
     chain are skipped below, so ACTUAL_VERSIONS tracks the real count.  */
  function_version_info = (struct _function_version_info *)
    XNEWVEC (struct _function_version_info, (num_versions - 1));

  /* The first version in the vector is the default decl. */
  default_decl = (*fndecls)[0];

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicity call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();

  /* Collect (version, predicate chain, priority) for every non-default
     version; index 0 is the default and is handled last.  */
  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
    {
      tree version_decl = ele;
      tree predicate_chain = NULL_TREE;
      unsigned int priority;
      /* Get attribute string, parse it and find the right predicate decl.
	 The predicate function could be a lengthy combination of many
	 features, like arch-type and various isa-variants.  */
      priority = get_builtin_code_for_version (version_decl,
					       &predicate_chain);

      /* No predicate chain means no dispatch condition can be built for
	 this version; drop it.  */
      if (predicate_chain == NULL_TREE)
	continue;

      function_version_info [actual_versions].version_decl = version_decl;
      function_version_info [actual_versions].predicate_chain
	= predicate_chain;
      function_version_info [actual_versions].dispatch_priority = priority;
      actual_versions++;
    }

  /* Sort the versions according to descending order of dispatch priority.  The
     priority is based on the ISA.  This is not a perfect solution.  There
     could still be ambiguity.  If more than one function version is suitable
     to execute,  which one should be dispatched?  In future, allow the user
     to specify a dispatch  priority next to the version.  */
  qsort (function_version_info, actual_versions,
	 sizeof (struct _function_version_info), feature_compare);

  /* Emit one conditional dispatch block per version, highest priority
     first; each call threads the updated empty basic block through.  */
  for  (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* dispatch default version at the end. */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);

  free (function_version_info);
  return 0;
}
|
|
|
|
|
|
|
|
/* This function changes the assembler name for functions that are
   versions.  If DECL is a function version and has a "target"
   attribute, it appends the attribute string to its assembler name.  */

static tree
ix86_mangle_function_version_assembler_name (tree decl, tree id)
{
  tree version_attr;
  const char *orig_name, *version_string;
  char *attr_str, *assembler_name;

  /* gnu_inline functions never get an out-of-line body, which function
     versioning requires.  */
  if (DECL_DECLARED_INLINE_P (decl)
      && lookup_attribute ("gnu_inline",
			   DECL_ATTRIBUTES (decl)))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "function versions cannot be marked as %<gnu_inline%>,"
	      " bodies have to be generated");

  if (DECL_VIRTUAL_P (decl)
      || DECL_VINDEX (decl))
    sorry ("virtual function multiversioning not supported");

  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));

  /* target attribute string cannot be NULL.  */
  gcc_assert (version_attr != NULL_TREE);

  orig_name = IDENTIFIER_POINTER (id);
  version_string
    = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));

  /* The default version keeps the unsuffixed assembler name.  */
  if (strcmp (version_string, "default") == 0)
    return id;

  attr_str = sorted_attr_string (TREE_VALUE (version_attr));
  /* "<orig>.<attrs>" plus the terminating NUL: +2.  */
  assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);

  sprintf (assembler_name, "%s.%s", orig_name, attr_str);

  /* Allow assembler name to be modified if already set.  */
  if (DECL_ASSEMBLER_NAME_SET_P (decl))
    SET_DECL_RTL (decl, NULL);

  tree ret = get_identifier (assembler_name);
  XDELETEVEC (attr_str);
  XDELETEVEC (assembler_name);
  return ret;
}
|
|
|
|
|
|
|
|
/* Target hook: compute the assembler name for DECL, starting from the
   candidate identifier ID.  Function versions get the sorted "target"
   attribute string appended; a subtarget may then apply its own
   mangling on top.  */

tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
  /* Only versioned function decls need the target-suffix treatment.  */
  const bool versioned_fn = (TREE_CODE (decl) == FUNCTION_DECL
			     && DECL_FUNCTION_VERSIONED (decl));
  if (versioned_fn)
    id = ix86_mangle_function_version_assembler_name (decl, id);
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif

  return id;
}
|
|
|
|
|
|
|
|
/* Make a dispatcher declaration for the multi-versioned function DECL.
   Calls to DECL function will be replaced with calls to the dispatcher
   by the front-end.  Returns the decl of the dispatcher function.  */

tree
ix86_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;
  struct cgraph_function_version_info *first_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  /* Dispatcher already created for this version set; reuse it.  */
  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* Find the default version and make it the first node.  */
  first_v = node_v;
  /* Go to the beginning of the chain.  */
  while (first_v->prev != NULL)
    first_v = first_v->prev;
  default_version_info = first_v;
  /* Walk forward until the default version is found.  */
  while (default_version_info != NULL)
    {
      if (is_function_default_version
	    (default_version_info->this_node->decl))
	break;
      default_version_info = default_version_info->next;
    }

  /* If there is no default node, just return NULL.  */
  if (default_version_info == NULL)
    return NULL;

  /* Make default info the first node.  */
  if (first_v != default_version_info)
    {
      /* Unlink the default version from its current position ...  */
      default_version_info->prev->next = default_version_info->next;
      if (default_version_info->next)
	default_version_info->next->prev = default_version_info->prev;
      /* ... and splice it in at the head of the chain.  */
      first_v->prev = default_version_info;
      default_version_info->next = first_v;
      default_version_info->prev = NULL;
    }

  default_node = default_version_info->this_node;

#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;
      struct cgraph_node *dispatcher_node = NULL;
      struct cgraph_function_version_info *dispatcher_version_info = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);

      dispatcher_node = cgraph_node::get_create (dispatch_decl);
      gcc_assert (dispatcher_node != NULL);
      dispatcher_node->dispatcher_function = 1;
      dispatcher_version_info
	= dispatcher_node->insert_new_function_version ();
      /* Chain the dispatcher's version info in front of the default's.  */
      dispatcher_version_info->next = default_version_info;
      dispatcher_node->definition = 1;

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
#endif
    {
      /* Without ifunc support multiversion dispatch cannot be emitted.  */
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  return dispatch_decl;
}
|
|
|
|
|
|
|
|
/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function,  DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */

static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* Create resolver function name based on default_decl.  */
  tree decl_name = clone_function_name (default_decl, "resolver");
  const char *resolver_name = IDENTIFIER_POINTER (decl_name);

  /* The resolver function should return a (void *). */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  decl = build_fn_decl (resolver_name, type);
  SET_DECL_ASSEMBLER_NAME (decl, decl_name);

  DECL_NAME (decl) = decl_name;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl. */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* Lower the new function and give it an initial empty basic block;
     the caller fills in the dispatch code there.  */
  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", resolver_name,
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
|
|
|
|
|
|
|
|
/* Generate the dispatching code body to dispatch multi-versioned function
|
|
|
|
DECL. The target hook is called to process the "target" attributes and
|
|
|
|
provide the code to dispatch the right function at run-time. NODE points
|
|
|
|
to the dispatcher decl whose body will be created. */
|
|
|
|
|
|
|
|
tree
|
|
|
|
ix86_generate_version_dispatcher_body (void *node_p)
|
|
|
|
{
|
|
|
|
tree resolver_decl;
|
|
|
|
basic_block empty_bb;
|
|
|
|
tree default_ver_decl;
|
|
|
|
struct cgraph_node *versn;
|
|
|
|
struct cgraph_node *node;
|
|
|
|
|
|
|
|
struct cgraph_function_version_info *node_version_info = NULL;
|
|
|
|
struct cgraph_function_version_info *versn_info = NULL;
|
|
|
|
|
|
|
|
node = (cgraph_node *)node_p;
|
|
|
|
|
|
|
|
node_version_info = node->function_version ();
|
|
|
|
gcc_assert (node->dispatcher_function
|
|
|
|
&& node_version_info != NULL);
|
|
|
|
|
|
|
|
if (node_version_info->dispatcher_resolver)
|
|
|
|
return node_version_info->dispatcher_resolver;
|
|
|
|
|
|
|
|
/* The first version in the chain corresponds to the default version. */
|
|
|
|
default_ver_decl = node_version_info->next->this_node->decl;
|
|
|
|
|
|
|
|
/* node is going to be an alias, so remove the finalized bit. */
|
|
|
|
node->definition = false;
|
|
|
|
|
|
|
|
resolver_decl = make_resolver_func (default_ver_decl,
|
|
|
|
node->decl, &empty_bb);
|
|
|
|
|
|
|
|
node_version_info->dispatcher_resolver = resolver_decl;
|
|
|
|
|
|
|
|
push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
|
|
|
|
|
|
|
|
auto_vec<tree, 2> fn_ver_vec;
|
|
|
|
|
|
|
|
for (versn_info = node_version_info->next; versn_info;
|
|
|
|
versn_info = versn_info->next)
|
|
|
|
{
|
|
|
|
versn = versn_info->this_node;
|
|
|
|
/* Check for virtual functions here again, as by this time it should
|
|
|
|
have been determined if this function needs a vtable index or
|
|
|
|
not. This happens for methods in derived classes that override
|
|
|
|
virtual methods in base classes but are not explicitly marked as
|
|
|
|
virtual. */
|
|
|
|
if (DECL_VINDEX (versn->decl))
|
|
|
|
sorry ("virtual function multiversioning not supported");
|
|
|
|
|
|
|
|
fn_ver_vec.safe_push (versn->decl);
|
|
|
|
}
|
|
|
|
|
|
|
|
dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
|
|
|
|
cgraph_edge::rebuild_edges ();
|
|
|
|
pop_cfun ();
|
|
|
|
return resolver_decl;
|
|
|
|
}
|
|
|
|
|
|
|
|
|