arm.c (output_move_neon): Update comment describing big-endian vector layout.

* config/arm/arm.c (output_move_neon): Update comment describing
	big-endian vector layout.
	(arm_assemble_integer): Do not handle big-endian NEON vectors
	specially.
	* config/arm/neon.md (vec_set<mode>_internal, vec_extract<mode>,
	neon_vget_lane<mode>_sext_internal,
	neon_vget_lane<mode>_zext_internal, neon_vget_lane<mode>): Adjust
	element indices for big-endian.

From-SVN: r138847
This commit is contained in:
Joseph Myers 2008-08-07 17:58:29 +01:00 committed by Joseph Myers
parent 058514b381
commit 874d42b93e
3 changed files with 88 additions and 36 deletions

View File

@ -1,3 +1,14 @@
2008-08-07 Joseph Myers <joseph@codesourcery.com>
* config/arm/arm.c (output_move_neon): Update comment describing
big-endian vector layout.
(arm_assemble_integer): Do not handle big-endian NEON vectors
specially.
* config/arm/neon.md (vec_set<mode>_internal, vec_extract<mode>,
neon_vget_lane<mode>_sext_internal,
neon_vget_lane<mode>_zext_internal, neon_vget_lane<mode>): Adjust
element indices for big-endian.
2008-08-07 Richard Henderson <rth@redhat.com>
* configure.ac (HAVE_GAS_CFI_PERSONALITY_DIRECTIVE): New.

View File

@ -10335,30 +10335,28 @@ output_move_vfp (rtx *operands)
}
/* Output a Neon quad-word load or store, or a load or store for
larger structure modes. We could also support post-modify forms using
VLD1/VST1 (for the vectorizer, and perhaps otherwise), but we don't do that
yet.
WARNING: The ordering of elements in memory is weird in big-endian mode,
because we use VSTM instead of VST1, to make it easy to make vector stores
via ARM registers write values in the same order as stores direct from Neon
registers. For example, the byte ordering of a quadword vector with 16-byte
elements like this:
larger structure modes.
[e7:e6:e5:e4:e3:e2:e1:e0] (highest-numbered element first)
WARNING: The ordering of elements is weird in big-endian mode,
because we use VSTM, as required by the EABI. GCC RTL defines
element ordering based on in-memory order. This can differ
from the architectural ordering of elements within a NEON register.
The intrinsics defined in arm_neon.h use the NEON register element
ordering, not the GCC RTL element ordering.
will be (with lowest address first, h = most-significant byte,
l = least-significant byte of element):
For example, the in-memory ordering of a big-endian quadword
vector with 16-bit elements when stored from register pair {d0,d1}
will be (lowest address first, d0[N] is NEON register element N):
[e3h, e3l, e2h, e2l, e1h, e1l, e0h, e0l,
e7h, e7l, e6h, e6l, e5h, e5l, e4h, e4l]
[d0[3], d0[2], d0[1], d0[0], d1[7], d1[6], d1[5], d1[4]]
When necessary, quadword registers (dN, dN+1) are moved to ARM registers from
rN in the order:
When necessary, quadword registers (dN, dN+1) are moved to ARM
registers from rN in the order:
dN -> (rN+1, rN), dN+1 -> (rN+3, rN+2)
So that STM/LDM can be used on vectors in ARM registers, and the same memory
layout will result as if VSTM/VLDM were used. */
So that STM/LDM can be used on vectors in ARM registers, and the
same memory layout will result as if VSTM/VLDM were used. */
const char *
output_move_neon (rtx *operands)
@ -13326,28 +13324,16 @@ arm_assemble_integer (rtx x, unsigned int size, int aligned_p)
if (arm_vector_mode_supported_p (mode))
{
int i, units;
unsigned int invmask = 0, parts_per_word;
gcc_assert (GET_CODE (x) == CONST_VECTOR);
units = CONST_VECTOR_NUNITS (x);
size = GET_MODE_SIZE (GET_MODE_INNER (mode));
/* For big-endian Neon vectors, we must permute the vector to the form
which, when loaded by a VLDR or VLDM instruction, will give a vector
with the elements in the right order. */
if (TARGET_NEON && WORDS_BIG_ENDIAN)
{
parts_per_word = UNITS_PER_WORD / size;
/* FIXME: This might be wrong for 64-bit vector elements, but we don't
support those anywhere yet. */
invmask = (parts_per_word == 0) ? 0 : (1 << (parts_per_word - 1)) - 1;
}
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
for (i = 0; i < units; i++)
{
rtx elt = CONST_VECTOR_ELT (x, i ^ invmask);
rtx elt = CONST_VECTOR_ELT (x, i);
assemble_integer
(elt, size, i == 0 ? BIGGEST_ALIGNMENT : size * BITS_PER_UNIT, 1);
}

View File

@ -735,7 +735,10 @@
(match_operand:SI 2 "immediate_operand" "i")))]
"TARGET_NEON"
{
operands[2] = GEN_INT (ffs ((int) INTVAL (operands[2]) - 1));
int elt = ffs ((int) INTVAL (operands[2]) - 1);
if (BYTES_BIG_ENDIAN)
elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt;
operands[2] = GEN_INT (elt);
return "vmov%?.<V_uf_sclr>\t%P0[%c2], %1";
}
@ -757,6 +760,9 @@
int hi = (elem / half_elts) * 2;
int regno = REGNO (operands[0]);
if (BYTES_BIG_ENDIAN)
elt = half_elts - 1 - elt;
operands[0] = gen_rtx_REG (<V_HALF>mode, regno + hi);
operands[2] = GEN_INT (elt);
@ -804,7 +810,15 @@
(match_operand:VD 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
"TARGET_NEON"
"vmov%?.<V_uf_sclr>\t%0, %P1[%c2]"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
return "vmov%?.<V_uf_sclr>\t%0, %P1[%c2]";
}
[(set_attr "predicable" "yes")
(set_attr "neon_type" "neon_bp_simple")]
)
@ -821,6 +835,9 @@
int hi = (INTVAL (operands[2]) / half_elts) * 2;
int regno = REGNO (operands[1]);
if (BYTES_BIG_ENDIAN)
elt = half_elts - 1 - elt;
operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi);
operands[2] = GEN_INT (elt);
@ -2413,7 +2430,15 @@
(match_operand:VD 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
"vmov%?.s<V_sz_elem>\t%0, %P1[%c2]"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
return "vmov%?.s<V_sz_elem>\t%0, %P1[%c2]";
}
[(set_attr "predicable" "yes")
(set_attr "neon_type" "neon_bp_simple")]
)
@ -2425,7 +2450,15 @@
(match_operand:VD 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON"
"vmov%?.u<V_sz_elem>\t%0, %P1[%c2]"
{
if (BYTES_BIG_ENDIAN)
{
int elt = INTVAL (operands[2]);
elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt;
operands[2] = GEN_INT (elt);
}
return "vmov%?.u<V_sz_elem>\t%0, %P1[%c2]";
}
[(set_attr "predicable" "yes")
(set_attr "neon_type" "neon_bp_simple")]
)
@ -2442,10 +2475,14 @@
int regno = REGNO (operands[1]);
unsigned int halfelts = GET_MODE_NUNITS (<MODE>mode) / 2;
unsigned int elt = INTVAL (operands[2]);
unsigned int elt_adj = elt % halfelts;
if (BYTES_BIG_ENDIAN)
elt_adj = halfelts - 1 - elt_adj;
ops[0] = operands[0];
ops[1] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
ops[2] = GEN_INT (elt % halfelts);
ops[2] = GEN_INT (elt_adj);
output_asm_insn ("vmov%?.s<V_sz_elem>\t%0, %P1[%c2]", ops);
return "";
@ -2466,10 +2503,14 @@
int regno = REGNO (operands[1]);
unsigned int halfelts = GET_MODE_NUNITS (<MODE>mode) / 2;
unsigned int elt = INTVAL (operands[2]);
unsigned int elt_adj = elt % halfelts;
if (BYTES_BIG_ENDIAN)
elt_adj = halfelts - 1 - elt_adj;
ops[0] = operands[0];
ops[1] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
ops[2] = GEN_INT (elt % halfelts);
ops[2] = GEN_INT (elt_adj);
output_asm_insn ("vmov%?.u<V_sz_elem>\t%0, %P1[%c2]", ops);
return "";
@ -2490,6 +2531,20 @@
neon_lane_bounds (operands[2], 0, GET_MODE_NUNITS (<MODE>mode));
if (BYTES_BIG_ENDIAN)
{
/* The intrinsics are defined in terms of a model where the
element ordering in memory is vldm order, whereas the generic
RTL is defined in terms of a model where the element ordering
in memory is array order. Convert the lane number to conform
to this model. */
unsigned int elt = INTVAL (operands[2]);
unsigned int reg_nelts
= 64 / GET_MODE_BITSIZE (GET_MODE_INNER (<MODE>mode));
elt ^= reg_nelts - 1;
operands[2] = GEN_INT (elt);
}
if ((magic & 3) == 3 || GET_MODE_BITSIZE (GET_MODE_INNER (<MODE>mode)) == 32)
insn = gen_vec_extract<mode> (operands[0], operands[1], operands[2]);
else