Add ldn/stn single support, fix ldnr support.

sim/aarch64/
	* simulator.c (LDn_STn_SINGLE_LANE_AND_SIZE): New.
	(do_vec_LDn_single, do_vec_STn_single): New.
	(do_vec_LDnR): Add and set new nregs var.  Replace switch on nregs with
	loop over nregs using new var n.  Add n times size to address in loop.
	Add n to vd in loop.
	(do_vec_load_store): Add comment for instruction bit 24.  New var
	single to hold instruction bit 24.  Add new code to use single.  Move
	ldnr support inside single if statements.  Fix ldnr register counts
	inside post if statement.  Change HALT_NYI calls to HALT_UNALLOC.

	sim/testsuite/sim/aarch64/
	* ldn_single.s: New.
	* ldnr.s: New.
	* stn_single.s: New.
This commit is contained in:
Jim Wilson 2017-02-14 14:31:03 -08:00
parent 3f77c7691f
commit e8f42b5e36
6 changed files with 714 additions and 285 deletions

View File

@ -1,3 +1,15 @@
2017-02-14 Jim Wilson <jim.wilson@linaro.org>
* simulator.c (LDn_STn_SINGLE_LANE_AND_SIZE): New.
(do_vec_LDn_single, do_vec_STn_single): New.
(do_vec_LDnR): Add and set new nregs var. Replace switch on nregs with
loop over nregs using new var n. Add n times size to address in loop.
Add n to vd in loop.
(do_vec_load_store): Add comment for instruction bit 24. New var
single to hold instruction bit 24. Add new code to use single. Move
ldnr support inside single if statements. Fix ldnr register counts
inside post if statement. Change HALT_NYI calls to HALT_UNALLOC.
2017-01-23 Jim Wilson <jim.wilson@linaro.org>
* simulator.c (do_vec_compare): Add case 0x23 for CMTST.

View File

@ -11560,6 +11560,178 @@ ST1_4 (sim_cpu *cpu, uint64_t address)
vec_store (cpu, address, 4);
}
/* Decode the lane index and element size of a single-structure (one lane)
   LDn/STn instruction.

   The macro reads the caller's locals FULL, S and SIZE (instr[30],
   instr[12] and instr[11,10] in do_vec_LDn_single and do_vec_STn_single)
   and assigns the caller's LANE.  On exit LANE holds the element index and
   SIZE has been overwritten with 0, 1, 2 or 3 for byte, halfword, word or
   doubleword elements.  Field combinations not handled below invoke
   HALT_UNALLOC.  */
#define LDn_STn_SINGLE_LANE_AND_SIZE() \
  do \
    { \
      const unsigned opc_hi_ = INSTR (15, 14); \
 \
      if (opc_hi_ == 0) \
        { \
          /* Byte elements: the index is full:s:size.  */ \
          lane = (full << 3) | (s << 2) | size; \
          size = 0; \
        } \
      else if (opc_hi_ == 1) \
        { \
          /* Halfword elements: the low size bit must be clear.  */ \
          if (size & 1) \
            HALT_UNALLOC; \
          lane = (full << 2) | (s << 1) | (size >> 1); \
          size = 1; \
        } \
      else if (opc_hi_ == 2) \
        { \
          if (size & 2) \
            HALT_UNALLOC; \
 \
          if (size & 1) \
            { \
              /* Doubleword elements: S must be clear.  */ \
              if (s) \
                HALT_UNALLOC; \
              lane = full; \
              size = 3; \
            } \
          else \
            { \
              /* Word elements: the index is full:s.  */ \
              lane = (full << 1) | s; \
              size = 2; \
            } \
        } \
      else \
        HALT_UNALLOC; \
    } \
  while (0)
/* Load single structure into one lane of N registers.

   ADDRESS is the address of the first element to transfer.  Register K of
   the list (K = 0 .. nregs - 1) receives the element read from
   ADDRESS + K * element size; this function writes only lane LANE of each
   register.  Base register writeback for the post-index forms is not
   performed here.  */
static void
do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
{
  /* instr[31]    = 0
     instr[30]    = Q, used for lane number
     instr[29,24] = 00 1101
     instr[23]    = 0=>simple, 1=>post
     instr[22]    = 1
     instr[21]    = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
                    11111 (immediate post inc)
     instr[15,13] = opcode
     instr[12]    = S, used for lane number
     instr[11,10] = size, also used for lane number
     instr[9,5]   = address
     instr[4,0]   = Vd  */
  unsigned full = INSTR (30, 30);
  unsigned vd = INSTR (4, 0);
  unsigned size = INSTR (11, 10);
  unsigned s = INSTR (12, 12);
  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
  int lane = 0;
  int i;

  NYI_assert (29, 24, 0x0D);
  NYI_assert (22, 22, 1);

  /* Compute the lane number first (using size), and then compute size.  */
  LDn_STn_SINGLE_LANE_AND_SIZE ();

  for (i = 0; i < nregs; i++)
    {
      /* The registers of a list are numbered modulo 32, so a list may run
         from V31 on to V0.  Reduce the number here instead of handing
         vd + i (which can reach 34) to the vector accessors.  */
      unsigned vreg = (vd + i) % 32;

      switch (size)
        {
        case 0:
          {
            uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
            aarch64_set_vec_u8 (cpu, vreg, lane, val);
            break;
          }

        case 1:
          {
            uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
            aarch64_set_vec_u16 (cpu, vreg, lane, val);
            break;
          }

        case 2:
          {
            uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
            aarch64_set_vec_u32 (cpu, vreg, lane, val);
            break;
          }

        case 3:
          {
            uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
            aarch64_set_vec_u64 (cpu, vreg, lane, val);
            break;
          }
        }
    }
}
/* Store single structure from one lane from N registers.

   ADDRESS is the address of the first element to transfer.  Lane LANE of
   register K of the list (K = 0 .. nregs - 1) is written to
   ADDRESS + K * element size.  Base register writeback for the post-index
   forms is not performed here.  */
static void
do_vec_STn_single (sim_cpu *cpu, uint64_t address)
{
  /* instr[31]    = 0
     instr[30]    = Q, used for lane number
     instr[29,24] = 00 1101
     instr[23]    = 0=>simple, 1=>post
     instr[22]    = 0
     instr[21]    = width: ST1-or-ST3 (0) / ST2-or-ST4 (1)
     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
                    11111 (immediate post inc)
     instr[15,13] = opcode
     instr[12]    = S, used for lane number
     instr[11,10] = size, also used for lane number
     instr[9,5]   = address
     instr[4,0]   = Vd  */
  unsigned full = INSTR (30, 30);
  unsigned vd = INSTR (4, 0);
  unsigned size = INSTR (11, 10);
  unsigned s = INSTR (12, 12);
  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
  int lane = 0;
  int i;

  NYI_assert (29, 24, 0x0D);
  NYI_assert (22, 22, 0);

  /* Compute the lane number first (using size), and then compute size.  */
  LDn_STn_SINGLE_LANE_AND_SIZE ();

  for (i = 0; i < nregs; i++)
    {
      /* The registers of a list are numbered modulo 32, so a list may run
         from V31 on to V0.  Reduce the number here instead of handing
         vd + i (which can reach 34) to the vector accessors.  */
      unsigned vreg = (vd + i) % 32;

      switch (size)
        {
        case 0:
          {
            uint8_t val = aarch64_get_vec_u8 (cpu, vreg, lane);
            aarch64_set_mem_u8 (cpu, address + i, val);
            break;
          }

        case 1:
          {
            uint16_t val = aarch64_get_vec_u16 (cpu, vreg, lane);
            aarch64_set_mem_u16 (cpu, address + (i * 2), val);
            break;
          }

        case 2:
          {
            uint32_t val = aarch64_get_vec_u32 (cpu, vreg, lane);
            aarch64_set_mem_u32 (cpu, address + (i * 4), val);
            break;
          }

        case 3:
          {
            uint64_t val = aarch64_get_vec_u64 (cpu, vreg, lane);
            aarch64_set_mem_u64 (cpu, address + (i * 8), val);
            break;
          }
        }
    }
}
/* Load single structure into all lanes of N registers. */
static void
do_vec_LDnR (sim_cpu *cpu, uint64_t address)
{
@ -11582,262 +11754,52 @@ do_vec_LDnR (sim_cpu *cpu, uint64_t address)
unsigned full = INSTR (30, 30);
unsigned vd = INSTR (4, 0);
unsigned size = INSTR (11, 10);
int i;
int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
int i, n;
NYI_assert (29, 24, 0x0D);
NYI_assert (22, 22, 1);
NYI_assert (15, 14, 3);
NYI_assert (12, 12, 0);
switch ((INSTR (13, 13) << 1) | INSTR (21, 21))
{
case 0: /* LD1R. */
switch (size)
for (n = 0; n < nregs; n++)
switch (size)
{
case 0:
{
case 0:
{
uint8_t val = aarch64_get_mem_u8 (cpu, address);
for (i = 0; i < (full ? 16 : 8); i++)
aarch64_set_vec_u8 (cpu, vd, i, val);
break;
}
case 1:
{
uint16_t val = aarch64_get_mem_u16 (cpu, address);
for (i = 0; i < (full ? 8 : 4); i++)
aarch64_set_vec_u16 (cpu, vd, i, val);
break;
}
case 2:
{
uint32_t val = aarch64_get_mem_u32 (cpu, address);
for (i = 0; i < (full ? 4 : 2); i++)
aarch64_set_vec_u32 (cpu, vd, i, val);
break;
}
case 3:
{
uint64_t val = aarch64_get_mem_u64 (cpu, address);
for (i = 0; i < (full ? 2 : 1); i++)
aarch64_set_vec_u64 (cpu, vd, i, val);
break;
}
default:
HALT_UNALLOC;
uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
for (i = 0; i < (full ? 16 : 8); i++)
aarch64_set_vec_u8 (cpu, vd + n, i, val);
break;
}
break;
case 1: /* LD2R. */
switch (size)
case 1:
{
case 0:
{
uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
for (i = 0; i < (full ? 16 : 8); i++)
{
aarch64_set_vec_u8 (cpu, vd, 0, val1);
aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
}
break;
}
case 1:
{
uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
for (i = 0; i < (full ? 8 : 4); i++)
{
aarch64_set_vec_u16 (cpu, vd, 0, val1);
aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
}
break;
}
case 2:
{
uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
for (i = 0; i < (full ? 4 : 2); i++)
{
aarch64_set_vec_u32 (cpu, vd, 0, val1);
aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
}
break;
}
case 3:
{
uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
for (i = 0; i < (full ? 2 : 1); i++)
{
aarch64_set_vec_u64 (cpu, vd, 0, val1);
aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
}
break;
}
default:
HALT_UNALLOC;
uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
for (i = 0; i < (full ? 8 : 4); i++)
aarch64_set_vec_u16 (cpu, vd + n, i, val);
break;
}
break;
case 2: /* LD3R. */
switch (size)
case 2:
{
case 0:
{
uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
for (i = 0; i < (full ? 16 : 8); i++)
{
aarch64_set_vec_u8 (cpu, vd, 0, val1);
aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
}
}
uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
for (i = 0; i < (full ? 4 : 2); i++)
aarch64_set_vec_u32 (cpu, vd + n, i, val);
break;
case 1:
{
uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
for (i = 0; i < (full ? 8 : 4); i++)
{
aarch64_set_vec_u16 (cpu, vd, 0, val1);
aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
}
}
break;
case 2:
{
uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
for (i = 0; i < (full ? 4 : 2); i++)
{
aarch64_set_vec_u32 (cpu, vd, 0, val1);
aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
}
}
break;
case 3:
{
uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
for (i = 0; i < (full ? 2 : 1); i++)
{
aarch64_set_vec_u64 (cpu, vd, 0, val1);
aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
}
}
break;
default:
HALT_UNALLOC;
}
break;
case 3: /* LD4R. */
switch (size)
case 3:
{
case 0:
{
uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
uint8_t val4 = aarch64_get_mem_u8 (cpu, address + 3);
for (i = 0; i < (full ? 16 : 8); i++)
{
aarch64_set_vec_u8 (cpu, vd, 0, val1);
aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
aarch64_set_vec_u8 (cpu, vd + 3, 0, val4);
}
}
uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
for (i = 0; i < (full ? 2 : 1); i++)
aarch64_set_vec_u64 (cpu, vd + n, i, val);
break;
case 1:
{
uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
uint32_t val4 = aarch64_get_mem_u16 (cpu, address + 6);
for (i = 0; i < (full ? 8 : 4); i++)
{
aarch64_set_vec_u16 (cpu, vd, 0, val1);
aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
aarch64_set_vec_u16 (cpu, vd + 3, 0, val4);
}
}
break;
case 2:
{
uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
uint32_t val4 = aarch64_get_mem_u32 (cpu, address + 12);
for (i = 0; i < (full ? 4 : 2); i++)
{
aarch64_set_vec_u32 (cpu, vd, 0, val1);
aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
aarch64_set_vec_u32 (cpu, vd + 3, 0, val4);
}
}
break;
case 3:
{
uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
uint64_t val4 = aarch64_get_mem_u64 (cpu, address + 24);
for (i = 0; i < (full ? 2 : 1); i++)
{
aarch64_set_vec_u64 (cpu, vd, 0, val1);
aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
aarch64_set_vec_u64 (cpu, vd + 3, 0, val4);
}
}
break;
default:
HALT_UNALLOC;
}
break;
default:
HALT_UNALLOC;
}
default:
HALT_UNALLOC;
}
}
static void
@ -11848,7 +11810,7 @@ do_vec_load_store (sim_cpu *cpu)
instr[31] = 0
instr[30] = element selector 0=>half, 1=>all elements
instr[29,25] = 00110
instr[24] = ?
instr[24] = 0=>multiple struct, 1=>single struct
instr[23] = 0=>simple, 1=>post
instr[22] = 0=>store, 1=>load
instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
@ -11876,6 +11838,7 @@ do_vec_load_store (sim_cpu *cpu)
instr[9,5] = Vn, can be SP
instr[4,0] = Vd */
int single;
int post;
int load;
unsigned vn;
@ -11885,15 +11848,16 @@ do_vec_load_store (sim_cpu *cpu)
if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
HALT_NYI;
type = INSTR (15, 12);
if (type != 0xE && type != 0xE && INSTR (21, 21) != 0)
HALT_NYI;
single = INSTR (24, 24);
post = INSTR (23, 23);
load = INSTR (22, 22);
type = INSTR (15, 12);
vn = INSTR (9, 5);
address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
if (! single && INSTR (21, 21) != 0)
HALT_UNALLOC;
if (post)
{
unsigned vm = INSTR (20, 16);
@ -11902,48 +11866,77 @@ do_vec_load_store (sim_cpu *cpu)
{
unsigned sizeof_operation;
switch (type)
if (single)
{
case 0: sizeof_operation = 32; break;
case 4: sizeof_operation = 24; break;
case 8: sizeof_operation = 16; break;
case 0xC:
sizeof_operation = INSTR (21, 21) ? 2 : 1;
sizeof_operation <<= INSTR (11, 10);
break;
case 0xE:
sizeof_operation = INSTR (21, 21) ? 8 : 4;
sizeof_operation <<= INSTR (11, 10);
break;
case 7:
/* One register, immediate offset variant. */
sizeof_operation = 8;
break;
case 10:
/* Two registers, immediate offset variant. */
sizeof_operation = 16;
break;
case 6:
/* Three registers, immediate offset variant. */
sizeof_operation = 24;
break;
case 2:
/* Four registers, immediate offset variant. */
sizeof_operation = 32;
break;
default:
HALT_UNALLOC;
if ((type >= 0) && (type <= 11))
{
int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
switch (INSTR (15, 14))
{
case 0:
sizeof_operation = nregs * 1;
break;
case 1:
sizeof_operation = nregs * 2;
break;
case 2:
if (INSTR (10, 10) == 0)
sizeof_operation = nregs * 4;
else
sizeof_operation = nregs * 8;
break;
default:
HALT_UNALLOC;
}
}
else if (type == 0xC)
{
sizeof_operation = INSTR (21, 21) ? 2 : 1;
sizeof_operation <<= INSTR (11, 10);
}
else if (type == 0xE)
{
sizeof_operation = INSTR (21, 21) ? 4 : 3;
sizeof_operation <<= INSTR (11, 10);
}
else
HALT_UNALLOC;
}
else
{
switch (type)
{
case 0: sizeof_operation = 32; break;
case 4: sizeof_operation = 24; break;
case 8: sizeof_operation = 16; break;
if (INSTR (30, 30))
sizeof_operation *= 2;
case 7:
/* One register, immediate offset variant. */
sizeof_operation = 8;
break;
case 10:
/* Two registers, immediate offset variant. */
sizeof_operation = 16;
break;
case 6:
/* Three registers, immediate offset variant. */
sizeof_operation = 24;
break;
case 2:
/* Four registers, immediate offset variant. */
sizeof_operation = 32;
break;
default:
HALT_UNALLOC;
}
if (INSTR (30, 30))
sizeof_operation *= 2;
}
aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
}
@ -11956,6 +11949,29 @@ do_vec_load_store (sim_cpu *cpu)
NYI_assert (20, 16, 0);
}
if (single)
{
if (load)
{
if ((type >= 0) && (type <= 11))
do_vec_LDn_single (cpu, address);
else if ((type == 0xC) || (type == 0xE))
do_vec_LDnR (cpu, address);
else
HALT_UNALLOC;
return;
}
/* Stores. */
if ((type >= 0) && (type <= 11))
{
do_vec_STn_single (cpu, address);
return;
}
HALT_UNALLOC;
}
if (load)
{
switch (type)
@ -11968,11 +11984,8 @@ do_vec_load_store (sim_cpu *cpu)
case 10: LD1_2 (cpu, address); return;
case 7: LD1_1 (cpu, address); return;
case 0xE:
case 0xC: do_vec_LDnR (cpu, address); return;
default:
HALT_NYI;
HALT_UNALLOC;
}
}
@ -11987,7 +12000,7 @@ do_vec_load_store (sim_cpu *cpu)
case 10: ST1_2 (cpu, address); return;
case 7: ST1_1 (cpu, address); return;
default:
HALT_NYI;
HALT_UNALLOC;
}
}

View File

@ -1,3 +1,9 @@
2017-02-14 Jim Wilson <jim.wilson@linaro.org>
* ldn_single.s: New.
* ldnr.s: New.
* stn_single.s: New.
2017-01-23 Jim Wilson <jim.wilson@linaro.org>
* cmtst.s: New.

View File

@ -0,0 +1,100 @@
# mach: aarch64
# Check the load single 1-element structure to one lane instructions:
# ld1, ld2, ld3, ld4.
# Check the addressing modes: no offset, post-index immediate offset,
# post-index register offset.
.include "testutils.inc"
# Test data: eight words whose bytes, read in ascending address order on a
# little-endian target, are the values 0x01 through 0x20.
input:
.word 0x04030201
.word 0x08070605
.word 0x0c0b0a09
.word 0x100f0e0d
.word 0x14131211
.word 0x18171615
.word 0x1c1b1a19
.word 0x201f1e1d
start
# x0 holds the address of input and is never modified afterwards; x2 is the
# working pointer that the post-index forms advance.
adrp x0, input
add x0, x0, :lo12:input
# LD1 to one lane with byte, halfword, word and doubleword elements, using
# the immediate post-index, register post-index and no-offset forms.
mov x2, x0
mov x3, #1
mov x4, #4
ld1 {v0.b}[0], [x2], 1
ld1 {v0.b}[1], [x2], x3
ld1 {v0.h}[1], [x2], 2
ld1 {v0.s}[1], [x2], x4
ld1 {v0.d}[1], [x2]
# The five loads together fill v0 with input bytes 0x01..0x10, which sum to
# 136.
addv b1, v0.16b
mov x5, v1.d[0]
cmp x5, #136
bne .Lfailure
# LD2 to one lane with doubleword, word, halfword and byte elements, using
# the register post-index, immediate post-index and no-offset forms.
mov x2, x0
mov x3, #16
mov x4, #4
ld2 {v0.d, v1.d}[0], [x2], x3
ld2 {v0.s, v1.s}[2], [x2], 8
ld2 {v0.h, v1.h}[6], [x2], x4
ld2 {v0.b, v1.b}[14], [x2], 2
ld2 {v0.b, v1.b}[15], [x2]
# Every byte of v0 and v1 has now been rewritten.  The byte sums are 221 for
# v0 and 307 for v1, but ADDV with a B destination keeps only the low eight
# bits of the sum, so v1 is checked against 307 - 256 = 51.
addv b2, v0.16b
addv b3, v1.16b
mov x5, v2.d[0]
mov x6, v3.d[0]
cmp x5, #221
bne .Lfailure
cmp x6, #51
bne .Lfailure
# LD3 to one word lane: lanes 0 and 1 via immediate post-index, lanes 2 and
# 3 via register post-index, each pair finishing with the no-offset form.
mov x2, x0
ld3 {v0.s, v1.s, v2.s}[0], [x2], 12
ld3 {v0.s, v1.s, v2.s}[1], [x2]
mov x2, x0
mov x3, #12
ld3 {v0.s, v1.s, v2.s}[2], [x2], x3
ld3 {v0.s, v1.s, v2.s}[3], [x2]
# Byte sums are 136, 200 and 264.  ADDV with a B destination keeps only the
# low eight bits of the sum, so v2 is checked against 264 - 256 = 8.
addv b3, v0.16b
addv b4, v1.16b
addv b5, v2.16b
mov x4, v3.d[0]
mov x5, v4.d[0]
mov x6, v5.d[0]
cmp x4, #136
bne .Lfailure
cmp x5, #200
bne .Lfailure
cmp x6, #8
bne .Lfailure
# LD4 to one word lane: lanes 0 and 1 via immediate post-index, lanes 2 and
# 3 via register post-index, each pair finishing with the no-offset form.
mov x2, x0
ld4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
ld4 {v0.s, v1.s, v2.s, v3.s}[1], [x2]
mov x2, x0
mov x3, #16
ld4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], x3
ld4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
# Byte sums are 168, 232, 296 and 360.  ADDV with a B destination keeps only
# the low eight bits of the sum, so v2 and v3 are checked against
# 296 - 256 = 40 and 360 - 256 = 104.
addv b4, v0.16b
addv b5, v1.16b
addv b6, v2.16b
addv b7, v3.16b
mov x4, v4.d[0]
mov x5, v5.d[0]
mov x6, v6.d[0]
mov x7, v7.d[0]
cmp x4, #168
bne .Lfailure
cmp x5, #232
bne .Lfailure
cmp x6, #40
bne .Lfailure
cmp x7, #104
bne .Lfailure
# All checks succeeded; any failed compare above branches to .Lfailure.
pass
.Lfailure:
fail

View File

@ -0,0 +1,176 @@
# mach: aarch64
# Check the load single 1-element structure and replicate to all lanes insns:
# ld1r, ld2r, ld3r, ld4r.
# Check the addressing modes: no offset, post-index immediate offset,
# post-index register offset.
.include "testutils.inc"
# Sixteen bytes holding 0x01..0x10 in ascending address order on a
# little-endian target; read by the ld1r and ld3r tests.
input:
.word 0x04030201
.word 0x08070605
.word 0x0c0b0a09
.word 0x100f0e0d
# Twelve words holding the values 1 through 12; read by the ld2r and ld4r
# tests.
input2:
.word 0x00000001
.word 0x00000002
.word 0x00000003
.word 0x00000004
.word 0x00000005
.word 0x00000006
.word 0x00000007
.word 0x00000008
.word 0x00000009
.word 0x0000000a
.word 0x0000000b
.word 0x0000000c
start
# x0 and x1 hold the addresses of input and input2 and are never modified
# afterwards; x2 is the working pointer that the post-index forms advance.
adrp x0, input
add x0, x0, :lo12:input
adrp x1, input2
add x1, x1, :lo12:input2
# LD1R with byte and halfword elements into 64-bit and 128-bit vectors,
# using the immediate post-index, register post-index and no-offset forms.
mov x2, x0
mov x3, #1
ld1r {v0.8b}, [x2], 1
ld1r {v1.16b}, [x2], x3
ld1r {v2.4h}, [x2], 2
ld1r {v3.8h}, [x2]
# Expected byte sums: eight copies of 1, sixteen copies of 2, four copies of
# the byte pair 3,4 and eight copies of the byte pair 5,6.
addv b0, v0.8b
addv b1, v1.16b
addv b2, v2.8b
addv b3, v3.16b
mov x2, v0.d[0]
mov x3, v1.d[0]
mov x4, v2.d[0]
mov x5, v3.d[0]
cmp x2, #8
bne .Lfailure
cmp x3, #32
bne .Lfailure
cmp x4, #28
bne .Lfailure
cmp x5, #88
bne .Lfailure
# LD2R with word and doubleword elements into 64-bit and 128-bit vectors,
# using the immediate post-index, register post-index and no-offset forms.
mov x2, x1
mov x3, #8
ld2r {v0.2s, v1.2s}, [x2], 8
ld2r {v2.4s, v3.4s}, [x2], x3
ld2r {v4.1d, v5.1d}, [x2], 16
ld2r {v6.2d, v7.2d}, [x2]
# The 64-bit vectors are reduced with a pairwise add (v0 and v1 into v0, v4
# and v5 into v4); the 128-bit vectors are summed as four words with addv.
addp v0.2s, v0.2s, v1.2s
addv s2, v2.4s
addv s3, v3.4s
addp v4.2s, v4.2s, v5.2s
addv s6, v6.4s
addv s7, v7.4s
mov w2, v0.s[0]
mov w3, v0.s[1]
mov x4, v2.d[0]
mov x5, v3.d[0]
mov w6, v4.s[0]
mov w7, v4.s[1]
mov x8, v6.d[0]
mov x9, v7.d[0]
# Expected: word 1 twice and word 2 twice; word 3 and word 4 four times each;
# words 5+6 and 7+8; words 9+10 and 11+12 each counted twice.
cmp w2, #2
bne .Lfailure
cmp w3, #4
bne .Lfailure
cmp x4, #12
bne .Lfailure
cmp x5, #16
bne .Lfailure
cmp w6, #11
bne .Lfailure
cmp w7, #15
bne .Lfailure
cmp x8, #38
bne .Lfailure
cmp x9, #46
bne .Lfailure
# LD3R with byte elements into 64-bit vectors, using the immediate
# post-index, register post-index and no-offset forms.  The three loads
# replicate input bytes 1..9 into v0..v8 respectively.
mov x2, x0
mov x3, #3
ld3r {v0.8b, v1.8b, v2.8b}, [x2], 3
ld3r {v3.8b, v4.8b, v5.8b}, [x2], x3
ld3r {v6.8b, v7.8b, v8.8b}, [x2]
# Sum each of the nine loaded registers; each sum is eight times the
# replicated byte.  Only v0..v8 are loaded, so only those are reduced.
addv b0, v0.8b
addv b1, v1.8b
addv b2, v2.8b
addv b3, v3.8b
addv b4, v4.8b
addv b5, v5.8b
addv b6, v6.8b
addv b7, v7.8b
addv b8, v8.8b
mov x2, v0.d[0]
mov x3, v1.d[0]
mov x4, v2.d[0]
mov x5, v3.d[0]
mov x6, v4.d[0]
mov x7, v5.d[0]
mov x8, v6.d[0]
mov x9, v7.d[0]
mov x10, v8.d[0]
cmp x2, #8
bne .Lfailure
cmp x3, #16
bne .Lfailure
cmp x4, #24
bne .Lfailure
cmp x5, #32
bne .Lfailure
cmp x6, #40
bne .Lfailure
cmp x7, #48
bne .Lfailure
cmp x8, #56
bne .Lfailure
cmp x9, #64
bne .Lfailure
cmp x10, #72
bne .Lfailure
# LD4R with word elements into 128-bit vectors, using the immediate
# post-index and no-offset forms.  The two loads replicate input2 words
# 1..8 into v0..v7 respectively.
mov x2, x1
ld4r {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 16
ld4r {v4.4s, v5.4s, v6.4s, v7.4s}, [x2]
# Each word sum is four times the replicated word.
addv s0, v0.4s
addv s1, v1.4s
addv s2, v2.4s
addv s3, v3.4s
addv s4, v4.4s
addv s5, v5.4s
addv s6, v6.4s
addv s7, v7.4s
mov x2, v0.d[0]
mov x3, v1.d[0]
mov x4, v2.d[0]
mov x5, v3.d[0]
mov x6, v4.d[0]
mov x7, v5.d[0]
mov x8, v6.d[0]
mov x9, v7.d[0]
cmp x2, #4
bne .Lfailure
cmp x3, #8
bne .Lfailure
cmp x4, #12
bne .Lfailure
cmp x5, #16
bne .Lfailure
cmp x6, #20
bne .Lfailure
cmp x7, #24
bne .Lfailure
cmp x8, #28
bne .Lfailure
cmp x9, #32
bne .Lfailure
# All checks succeeded; any failed compare above branches to .Lfailure.
pass
.Lfailure:
fail

View File

@ -0,0 +1,122 @@
# mach: aarch64
# Check the store single 1-element structure to one lane instructions:
# st1, st2, st3, st4.
# Check the addressing modes: no offset, post-index immediate offset,
# post-index register offset.
.include "testutils.inc"
# Source data: eight words whose bytes, read in ascending address order on a
# little-endian target, are the values 0x01 through 0x20.
input:
.word 0x04030201
.word 0x08070605
.word 0x0c0b0a09
.word 0x100f0e0d
.word 0x14131211
.word 0x18171615
.word 0x1c1b1a19
.word 0x201f1e1d
# Destination buffer for the stores: 64 zero bytes, enough for the four
# 16-byte structures written by the st4 test.
output:
.zero 64
start
# x0 and x1 hold the addresses of input and output and are never modified
# afterwards; x2 is the working pointer that the post-index forms advance.
adrp x0, input
add x0, x0, :lo12:input
adrp x1, output
add x1, x1, :lo12:output
# Source registers for every test below: q0 and q2 hold input bytes
# 0x01..0x10, q1 and q3 hold input bytes 0x09..0x18.
mov x2, x0
ldr q0, [x2], 8
ldr q1, [x2]
mov x2, x0
ldr q2, [x2], 8
ldr q3, [x2]
# ST1 from one lane with byte, halfword, word and doubleword elements, using
# the immediate post-index, register post-index and no-offset forms.
mov x2, x1
mov x3, #1
mov x4, #4
st1 {v0.b}[0], [x2], 1
st1 {v0.b}[1], [x2], x3
st1 {v0.h}[1], [x2], 2
st1 {v0.s}[1], [x2], x4
st1 {v0.d}[1], [x2]
# The five stores together copy all of v0 to output, so the first sixteen
# output bytes sum to 136.
ldr q4, [x1]
addv b4, v4.16b
mov x5, v4.d[0]
cmp x5, #136
bne .Lfailure
# ST2 from one lane with doubleword, word, halfword and byte elements, using
# the register post-index, immediate post-index and no-offset forms.  The
# five stores rewrite the first 32 bytes of output.
mov x2, x1
mov x3, #16
mov x4, #4
st2 {v0.d, v1.d}[0], [x2], x3
st2 {v0.s, v1.s}[2], [x2], 8
st2 {v0.h, v1.h}[6], [x2], x4
st2 {v0.b, v1.b}[14], [x2], 2
st2 {v0.b, v1.b}[15], [x2]
mov x2, x1
ldr q4, [x2], 16
ldr q5, [x2]
# Byte sums of the two 16-byte halves are 136 and 264.  ADDV with a B
# destination keeps only the low eight bits of the sum, so the second half
# is checked against 264 - 256 = 8.
addv b4, v4.16b
addv b5, v5.16b
mov x5, v4.d[0]
mov x6, v5.d[0]
cmp x5, #136
bne .Lfailure
cmp x6, #8
bne .Lfailure
# ST3 from each of the four word lanes in turn, alternating the immediate
# and register post-index forms and finishing with the no-offset form.  The
# four stores rewrite the first 48 bytes of output.
mov x2, x1
mov x3, #12
st3 {v0.s, v1.s, v2.s}[0], [x2], 12
st3 {v0.s, v1.s, v2.s}[1], [x2], x3
st3 {v0.s, v1.s, v2.s}[2], [x2], 12
st3 {v0.s, v1.s, v2.s}[3], [x2]
# Read the 48 bytes back as three 16-byte blocks and check each byte sum.
mov x2, x1
ldr q4, [x2], 16
ldr q5, [x2], 16
ldr q6, [x2]
addv b4, v4.16b
addv b5, v5.16b
addv b6, v6.16b
mov x4, v4.d[0]
mov x5, v5.d[0]
mov x6, v6.d[0]
cmp x4, #88
bne .Lfailure
cmp x5, #200
bne .Lfailure
cmp x6, #248
bne .Lfailure
# ST4 from each of the four word lanes in turn, alternating the immediate
# and register post-index forms and finishing with the no-offset form.  The
# four stores rewrite all 64 bytes of output.
mov x2, x1
mov x3, #16
st4 {v0.s, v1.s, v2.s, v3.s}[0], [x2], 16
st4 {v0.s, v1.s, v2.s, v3.s}[1], [x2], x3
st4 {v0.s, v1.s, v2.s, v3.s}[2], [x2], 16
st4 {v0.s, v1.s, v2.s, v3.s}[3], [x2]
# Read the 64 bytes back as four 16-byte blocks.  Their byte sums are 104,
# 168, 232 and 296; ADDV with a B destination keeps only the low eight bits
# of the sum, so the last block is checked against 296 - 256 = 40.
mov x2, x1
ldr q4, [x2], 16
ldr q5, [x2], 16
ldr q6, [x2], 16
ldr q7, [x2]
addv b4, v4.16b
addv b5, v5.16b
addv b6, v6.16b
addv b7, v7.16b
mov x4, v4.d[0]
mov x5, v5.d[0]
mov x6, v6.d[0]
mov x7, v7.d[0]
cmp x4, #104
bne .Lfailure
cmp x5, #168
bne .Lfailure
cmp x6, #232
bne .Lfailure
cmp x7, #40
bne .Lfailure
# All checks succeeded; any failed compare above branches to .Lfailure.
pass
.Lfailure:
fail