Fix ldn/stn multiple instructions. Fix testcases with unaligned data.
sim/aarch64/ * simulator.c (vec_load): Add M argument. Rewrite to iterate over registers based on structure size. (LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load. (LD1_1): Replace with call to vec_load. (vec_store): Add new M argument. Rewrite to iterate over registers based on structure size. (ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store. (ST1_1): Replace with call to vec_store. sim/testsuite/sim/aarch64/ * fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align data. * sumulh.s: Delete unnecessary data alignment. * stn_single.s: Align data. Fix unaligned ldr insns. Adjust cmp arguments to match change. * ldn_multiple.s, stn_multiple.s: New.
This commit is contained in:
parent
10f489e576
commit
bf1554384b
|
@ -1,3 +1,14 @@
|
|||
2017-04-22 Jim Wilson <jim.wilson@linaro.org>
|
||||
|
||||
* simulator.c (vec_load): Add M argument. Rewrite to iterate over
|
||||
registers based on structure size.
|
||||
(LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
|
||||
(LD1_1): Replace with call to vec_load.
|
||||
(vec_store): Add new M argument. Rewrite to iterate over registers
|
||||
based on structure size.
|
||||
(ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
|
||||
(ST1_1): Replace with call to vec_store.
|
||||
|
||||
2017-04-08 Jim Wilson <jim.wilson@linaro.org>
|
||||
|
||||
* simulator.c (do_vec_FCVTL): New.
|
||||
|
|
|
@ -11524,310 +11524,224 @@ vec_reg (unsigned v, unsigned o)
|
|||
return (v + o) & 0x3F;
|
||||
}
|
||||
|
||||
/* Load multiple N-element structures to N consecutive registers. */
|
||||
/* Load multiple N-element structures to M consecutive registers. */
|
||||
static void
|
||||
vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
|
||||
vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
|
||||
{
|
||||
int all = INSTR (30, 30);
|
||||
unsigned size = INSTR (11, 10);
|
||||
unsigned vd = INSTR (4, 0);
|
||||
unsigned i;
|
||||
unsigned rpt = (N == M) ? 1 : M;
|
||||
unsigned selem = N;
|
||||
unsigned i, j, k;
|
||||
|
||||
switch (size)
|
||||
{
|
||||
case 0: /* 8-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (16 * N); i++)
|
||||
aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
|
||||
aarch64_get_mem_u8 (cpu, address + i));
|
||||
else
|
||||
for (i = 0; i < (8 * N); i++)
|
||||
aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
|
||||
aarch64_get_mem_u8 (cpu, address + i));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (8 + (8 * all)); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
|
||||
aarch64_get_mem_u8 (cpu, address));
|
||||
address += 1;
|
||||
}
|
||||
return;
|
||||
|
||||
case 1: /* 16-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (8 * N); i++)
|
||||
aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
|
||||
aarch64_get_mem_u16 (cpu, address + i * 2));
|
||||
else
|
||||
for (i = 0; i < (4 * N); i++)
|
||||
aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
|
||||
aarch64_get_mem_u16 (cpu, address + i * 2));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (4 + (4 * all)); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
|
||||
aarch64_get_mem_u16 (cpu, address));
|
||||
address += 2;
|
||||
}
|
||||
return;
|
||||
|
||||
case 2: /* 32-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (4 * N); i++)
|
||||
aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
|
||||
aarch64_get_mem_u32 (cpu, address + i * 4));
|
||||
else
|
||||
for (i = 0; i < (2 * N); i++)
|
||||
aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
|
||||
aarch64_get_mem_u32 (cpu, address + i * 4));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (2 + (2 * all)); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
|
||||
aarch64_get_mem_u32 (cpu, address));
|
||||
address += 4;
|
||||
}
|
||||
return;
|
||||
|
||||
case 3: /* 64-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (2 * N); i++)
|
||||
aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
|
||||
aarch64_get_mem_u64 (cpu, address + i * 8));
|
||||
else
|
||||
for (i = 0; i < N; i++)
|
||||
aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
|
||||
aarch64_get_mem_u64 (cpu, address + i * 8));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (1 + all); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
|
||||
aarch64_get_mem_u64 (cpu, address));
|
||||
address += 8;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* LD4: load multiple 4-element to four consecutive registers. */
|
||||
/* Load multiple 4-element structures into four consecutive registers. */
|
||||
static void
|
||||
LD4 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
vec_load (cpu, address, 4);
|
||||
vec_load (cpu, address, 4, 4);
|
||||
}
|
||||
|
||||
/* LD3: load multiple 3-element structures to three consecutive registers. */
|
||||
/* Load multiple 3-element structures into three consecutive registers. */
|
||||
static void
|
||||
LD3 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
vec_load (cpu, address, 3);
|
||||
vec_load (cpu, address, 3, 3);
|
||||
}
|
||||
|
||||
/* LD2: load multiple 2-element structures to two consecutive registers. */
|
||||
/* Load multiple 2-element structures into two consecutive registers. */
|
||||
static void
|
||||
LD2 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
vec_load (cpu, address, 2);
|
||||
vec_load (cpu, address, 2, 2);
|
||||
}
|
||||
|
||||
/* Load multiple 1-element structures into one register. */
|
||||
static void
|
||||
LD1_1 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
int all = INSTR (30, 30);
|
||||
unsigned size = INSTR (11, 10);
|
||||
unsigned vd = INSTR (4, 0);
|
||||
unsigned i;
|
||||
|
||||
switch (size)
|
||||
{
|
||||
case 0:
|
||||
/* LD1 {Vd.16b}, addr, #16 */
|
||||
/* LD1 {Vd.8b}, addr, #8 */
|
||||
for (i = 0; i < (all ? 16 : 8); i++)
|
||||
aarch64_set_vec_u8 (cpu, vd, i,
|
||||
aarch64_get_mem_u8 (cpu, address + i));
|
||||
return;
|
||||
|
||||
case 1:
|
||||
/* LD1 {Vd.8h}, addr, #16 */
|
||||
/* LD1 {Vd.4h}, addr, #8 */
|
||||
for (i = 0; i < (all ? 8 : 4); i++)
|
||||
aarch64_set_vec_u16 (cpu, vd, i,
|
||||
aarch64_get_mem_u16 (cpu, address + i * 2));
|
||||
return;
|
||||
|
||||
case 2:
|
||||
/* LD1 {Vd.4s}, addr, #16 */
|
||||
/* LD1 {Vd.2s}, addr, #8 */
|
||||
for (i = 0; i < (all ? 4 : 2); i++)
|
||||
aarch64_set_vec_u32 (cpu, vd, i,
|
||||
aarch64_get_mem_u32 (cpu, address + i * 4));
|
||||
return;
|
||||
|
||||
case 3:
|
||||
/* LD1 {Vd.2d}, addr, #16 */
|
||||
/* LD1 {Vd.1d}, addr, #8 */
|
||||
for (i = 0; i < (all ? 2 : 1); i++)
|
||||
aarch64_set_vec_u64 (cpu, vd, i,
|
||||
aarch64_get_mem_u64 (cpu, address + i * 8));
|
||||
return;
|
||||
}
|
||||
vec_load (cpu, address, 1, 1);
|
||||
}
|
||||
|
||||
/* Load multiple 1-element structures into two registers. */
|
||||
static void
|
||||
LD1_2 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
/* FIXME: This algorithm is *exactly* the same as the LD2 version.
|
||||
So why have two different instructions ? There must be something
|
||||
wrong somewhere. */
|
||||
vec_load (cpu, address, 2);
|
||||
vec_load (cpu, address, 1, 2);
|
||||
}
|
||||
|
||||
/* Load multiple 1-element structures into three registers. */
|
||||
static void
|
||||
LD1_3 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
/* FIXME: This algorithm is *exactly* the same as the LD3 version.
|
||||
So why have two different instructions ? There must be something
|
||||
wrong somewhere. */
|
||||
vec_load (cpu, address, 3);
|
||||
vec_load (cpu, address, 1, 3);
|
||||
}
|
||||
|
||||
/* Load multiple 1-element structures into four registers. */
|
||||
static void
|
||||
LD1_4 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
/* FIXME: This algorithm is *exactly* the same as the LD4 version.
|
||||
So why have two different instructions ? There must be something
|
||||
wrong somewhere. */
|
||||
vec_load (cpu, address, 4);
|
||||
vec_load (cpu, address, 1, 4);
|
||||
}
|
||||
|
||||
/* Store multiple N-element structures to N consecutive registers. */
|
||||
/* Store multiple N-element structures from M consecutive registers. */
|
||||
static void
|
||||
vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
|
||||
vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
|
||||
{
|
||||
int all = INSTR (30, 30);
|
||||
unsigned size = INSTR (11, 10);
|
||||
unsigned vd = INSTR (4, 0);
|
||||
unsigned i;
|
||||
unsigned rpt = (N == M) ? 1 : M;
|
||||
unsigned selem = N;
|
||||
unsigned i, j, k;
|
||||
|
||||
switch (size)
|
||||
{
|
||||
case 0: /* 8-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (16 * N); i++)
|
||||
aarch64_set_mem_u8
|
||||
(cpu, address + i,
|
||||
aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
|
||||
else
|
||||
for (i = 0; i < (8 * N); i++)
|
||||
aarch64_set_mem_u8
|
||||
(cpu, address + i,
|
||||
aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (8 + (8 * all)); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_mem_u8
|
||||
(cpu, address,
|
||||
aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
|
||||
address += 1;
|
||||
}
|
||||
return;
|
||||
|
||||
case 1: /* 16-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (8 * N); i++)
|
||||
aarch64_set_mem_u16
|
||||
(cpu, address + i * 2,
|
||||
aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
|
||||
else
|
||||
for (i = 0; i < (4 * N); i++)
|
||||
aarch64_set_mem_u16
|
||||
(cpu, address + i * 2,
|
||||
aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (4 + (4 * all)); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_mem_u16
|
||||
(cpu, address,
|
||||
aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
|
||||
address += 2;
|
||||
}
|
||||
return;
|
||||
|
||||
case 2: /* 32-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (4 * N); i++)
|
||||
aarch64_set_mem_u32
|
||||
(cpu, address + i * 4,
|
||||
aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
|
||||
else
|
||||
for (i = 0; i < (2 * N); i++)
|
||||
aarch64_set_mem_u32
|
||||
(cpu, address + i * 4,
|
||||
aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (2 + (2 * all)); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_mem_u32
|
||||
(cpu, address,
|
||||
aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
|
||||
address += 4;
|
||||
}
|
||||
return;
|
||||
|
||||
case 3: /* 64-bit operations. */
|
||||
if (all)
|
||||
for (i = 0; i < (2 * N); i++)
|
||||
aarch64_set_mem_u64
|
||||
(cpu, address + i * 8,
|
||||
aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
|
||||
else
|
||||
for (i = 0; i < N; i++)
|
||||
aarch64_set_mem_u64
|
||||
(cpu, address + i * 8,
|
||||
aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
|
||||
for (i = 0; i < rpt; i++)
|
||||
for (j = 0; j < (1 + all); j++)
|
||||
for (k = 0; k < selem; k++)
|
||||
{
|
||||
aarch64_set_mem_u64
|
||||
(cpu, address,
|
||||
aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
|
||||
address += 8;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Store multiple 4-element structure to four consecutive registers. */
|
||||
/* Store multiple 4-element structures from four consecutive registers. */
|
||||
static void
|
||||
ST4 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
vec_store (cpu, address, 4);
|
||||
vec_store (cpu, address, 4, 4);
|
||||
}
|
||||
|
||||
/* Store multiple 3-element structures to three consecutive registers. */
|
||||
/* Store multiple 3-element structures from three consecutive registers. */
|
||||
static void
|
||||
ST3 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
vec_store (cpu, address, 3);
|
||||
vec_store (cpu, address, 3, 3);
|
||||
}
|
||||
|
||||
/* Store multiple 2-element structures to two consecutive registers. */
|
||||
/* Store multiple 2-element structures from two consecutive registers. */
|
||||
static void
|
||||
ST2 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
vec_store (cpu, address, 2);
|
||||
vec_store (cpu, address, 2, 2);
|
||||
}
|
||||
|
||||
/* Store multiple 1-element structures into one register. */
|
||||
/* Store multiple 1-element structures from one register. */
|
||||
static void
|
||||
ST1_1 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
int all = INSTR (30, 30);
|
||||
unsigned size = INSTR (11, 10);
|
||||
unsigned vd = INSTR (4, 0);
|
||||
unsigned i;
|
||||
|
||||
switch (size)
|
||||
{
|
||||
case 0:
|
||||
for (i = 0; i < (all ? 16 : 8); i++)
|
||||
aarch64_set_mem_u8 (cpu, address + i,
|
||||
aarch64_get_vec_u8 (cpu, vd, i));
|
||||
return;
|
||||
|
||||
case 1:
|
||||
for (i = 0; i < (all ? 8 : 4); i++)
|
||||
aarch64_set_mem_u16 (cpu, address + i * 2,
|
||||
aarch64_get_vec_u16 (cpu, vd, i));
|
||||
return;
|
||||
|
||||
case 2:
|
||||
for (i = 0; i < (all ? 4 : 2); i++)
|
||||
aarch64_set_mem_u32 (cpu, address + i * 4,
|
||||
aarch64_get_vec_u32 (cpu, vd, i));
|
||||
return;
|
||||
|
||||
case 3:
|
||||
for (i = 0; i < (all ? 2 : 1); i++)
|
||||
aarch64_set_mem_u64 (cpu, address + i * 8,
|
||||
aarch64_get_vec_u64 (cpu, vd, i));
|
||||
return;
|
||||
}
|
||||
vec_store (cpu, address, 1, 1);
|
||||
}
|
||||
|
||||
/* Store multiple 1-element structures into two registers. */
|
||||
/* Store multiple 1-element structures from two registers. */
|
||||
static void
|
||||
ST1_2 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
/* FIXME: This algorithm is *exactly* the same as the ST2 version.
|
||||
So why have two different instructions ? There must be
|
||||
something wrong somewhere. */
|
||||
vec_store (cpu, address, 2);
|
||||
vec_store (cpu, address, 1, 2);
|
||||
}
|
||||
|
||||
/* Store multiple 1-element structures into three registers. */
|
||||
/* Store multiple 1-element structures from three registers. */
|
||||
static void
|
||||
ST1_3 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
/* FIXME: This algorithm is *exactly* the same as the ST3 version.
|
||||
So why have two different instructions ? There must be
|
||||
something wrong somewhere. */
|
||||
vec_store (cpu, address, 3);
|
||||
vec_store (cpu, address, 1, 3);
|
||||
}
|
||||
|
||||
/* Store multiple 1-element structures into four registers. */
|
||||
/* Store multiple 1-element structures from four registers. */
|
||||
static void
|
||||
ST1_4 (sim_cpu *cpu, uint64_t address)
|
||||
{
|
||||
/* FIXME: This algorithm is *exactly* the same as the ST4 version.
|
||||
So why have two different instructions ? There must be
|
||||
something wrong somewhere. */
|
||||
vec_store (cpu, address, 4);
|
||||
vec_store (cpu, address, 1, 4);
|
||||
}
|
||||
|
||||
#define LDn_STn_SINGLE_LANE_AND_SIZE() \
|
||||
|
|
|
@ -1,3 +1,12 @@
|
|||
2017-04-22 Jim Wilson <jim.wilson@linaro.org>
|
||||
|
||||
* fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align
|
||||
data.
|
||||
* sumulh.s: Delete unnecessary data alignment.
|
||||
* stn_single.s: Align data. Fix unaligned ldr insns. Adjust cmp
|
||||
arguments to match change.
|
||||
* ldn_multiple.s, stn_multiple.s: New.
|
||||
|
||||
2017-04-08 Jim Wilson <jim.wilson@linaro.org>
|
||||
|
||||
* fcvtl.s: New.
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
# For 64-bit unsigned convert, test values 1.5, LONG_MAX, and ULONG_MAX.
|
||||
|
||||
.data
|
||||
.align 4
|
||||
fm1p5:
|
||||
.word 3217031168
|
||||
fimax:
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
fm1:
|
||||
.word 3212836864
|
||||
fmax:
|
||||
|
|
|
@ -0,0 +1,136 @@
|
|||
# mach: aarch64
|
||||
|
||||
# Check the load multiple structure instructions: ld1, ld2, ld3, ld4.
|
||||
# Check the addressing modes: no offset, post-index immediate offset,
|
||||
# post-index register offset.
|
||||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
.word 0x0c0b0a09
|
||||
.word 0x100f0e0d
|
||||
.word 0xfcfdfeff
|
||||
.word 0xf8f9fafb
|
||||
.word 0xf4f5f6f7
|
||||
.word 0xf0f1f2f3
|
||||
|
||||
start
|
||||
adrp x0, input
|
||||
add x0, x0, :lo12:input
|
||||
|
||||
mov x2, x0
|
||||
mov x3, #16
|
||||
ld1 {v0.16b}, [x2], 16
|
||||
ld1 {v1.8h}, [x2], x3
|
||||
addv b4, v0.16b
|
||||
addv b5, v1.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #136
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #120
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x0
|
||||
mov x3, #16
|
||||
ld2 {v0.8b, v1.8b}, [x2], x3
|
||||
ld2 {v2.4h, v3.4h}, [x2], 16
|
||||
addv b4, v0.8b
|
||||
addv b5, v1.8b
|
||||
addv b6, v2.8b
|
||||
addv b7, v3.8b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #64
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #72
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #196
|
||||
bne .Lfailure
|
||||
mov x7, v7.d[0]
|
||||
cmp x7, #180
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x0
|
||||
ld3 {v0.2s, v1.2s, v2.2s}, [x2]
|
||||
addv b4, v0.8b
|
||||
addv b5, v1.8b
|
||||
addv b6, v2.8b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #68
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #16
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #16
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x0
|
||||
ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]
|
||||
addv b4, v0.8b
|
||||
addv b5, v1.8b
|
||||
addv b6, v2.8b
|
||||
addv b7, v3.8b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #0
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #0
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #0
|
||||
bne .Lfailure
|
||||
mov x7, v7.d[0]
|
||||
cmp x7, #0
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x0
|
||||
ld1 {v0.4s, v1.4s}, [x2]
|
||||
addv b4, v0.16b
|
||||
addv b5, v1.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #136
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #120
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x0
|
||||
ld1 {v0.1d, v1.1d, v2.1d}, [x2]
|
||||
addv b4, v0.8b
|
||||
addv b5, v1.8b
|
||||
addv b6, v2.8b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #36
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #100
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #220
|
||||
bne .Lfailure
|
||||
|
||||
# ld1 with four .1d registers: each register receives the next 8 input
|
||||
# bytes, so the byte sums (mod 256) are v0=36, v1=100, v2=220, v3=156.
|
||||
# All four destination registers are checked, including the last one.
|
||||
mov x2, x0
|
||||
ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x2]
|
||||
addv b4, v0.8b
|
||||
addv b5, v1.8b
|
||||
addv b6, v2.8b
|
||||
addv b7, v3.8b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #36
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #100
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #220
|
||||
bne .Lfailure
|
||||
mov x7, v7.d[0]
|
||||
cmp x7, #156
|
||||
bne .Lfailure
|
||||
|
||||
pass
|
||||
.Lfailure:
|
||||
fail
|
|
@ -7,6 +7,8 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
|
|
|
@ -0,0 +1,171 @@
|
|||
# mach: aarch64
|
||||
|
||||
# Check the store multiple structure instructions: st1, st2, st3, st4.
|
||||
# Check the addressing modes: no offset, post-index immediate offset,
|
||||
# post-index register offset.
|
||||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
.word 0x0c0b0a09
|
||||
.word 0x100f0e0d
|
||||
.word 0xfcfdfeff
|
||||
.word 0xf8f9fafb
|
||||
.word 0xf4f5f6f7
|
||||
.word 0xf0f1f2f3
|
||||
output:
|
||||
.zero 64
|
||||
|
||||
start
|
||||
adrp x0, input
|
||||
add x0, x0, :lo12:input
|
||||
adrp x1, output
|
||||
add x1, x1, :lo12:output
|
||||
|
||||
mov x2, x0
|
||||
ldr q0, [x2], 16
|
||||
ldr q1, [x2]
|
||||
mov x2, x0
|
||||
ldr q2, [x2], 16
|
||||
ldr q3, [x2]
|
||||
|
||||
mov x2, x1
|
||||
mov x3, #16
|
||||
st1 {v0.16b}, [x2], 16
|
||||
st1 {v1.8h}, [x2], x3
|
||||
mov x2, x1
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #136
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #120
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
mov x3, #16
|
||||
st2 {v0.8b, v1.8b}, [x2], 16
|
||||
st2 {v2.4h, v3.4h}, [x2], x3
|
||||
mov x2, x1
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #0
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #0
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
st3 {v0.4s, v1.4s, v2.4s}, [x2]
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2], 16
|
||||
ldr q6, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
addv b6, v6.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #36
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #0
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #100
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2], 16
|
||||
ldr q6, [x2], 16
|
||||
ldr q7, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
addv b6, v6.16b
|
||||
addv b7, v7.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #0
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #0
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #0
|
||||
bne .Lfailure
|
||||
mov x7, v7.d[0]
|
||||
cmp x7, #0
|
||||
bne .Lfailure
|
||||
|
||||
# No early 'pass' here: the st1 two-, three- and four-register checks below must also run before the final 'pass'.
|
||||
|
||||
mov x2, x1
|
||||
st1 {v0.2s, v1.2s}, [x2], 16
|
||||
st1 {v2.1d, v3.1d}, [x2]
|
||||
mov x2, x1
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #0
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #0
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
st1 {v0.2d, v1.2d, v2.2d}, [x2]
|
||||
mov x2, x1
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2], 16
|
||||
ldr q6, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
addv b6, v6.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #136
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #120
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #136
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
|
||||
mov x2, x1
|
||||
ldr q4, [x2], 16
|
||||
ldr q5, [x2], 16
|
||||
ldr q6, [x2], 16
|
||||
ldr q7, [x2]
|
||||
addv b4, v4.16b
|
||||
addv b5, v5.16b
|
||||
addv b6, v6.16b
|
||||
addv b7, v7.16b
|
||||
mov x4, v4.d[0]
|
||||
cmp x4, #136
|
||||
bne .Lfailure
|
||||
mov x5, v5.d[0]
|
||||
cmp x5, #120
|
||||
bne .Lfailure
|
||||
mov x6, v6.d[0]
|
||||
cmp x6, #136
|
||||
bne .Lfailure
|
||||
mov x7, v7.d[0]
|
||||
cmp x7, #120
|
||||
bne .Lfailure
|
||||
|
||||
pass
|
||||
.Lfailure:
|
||||
fail
|
|
@ -7,6 +7,8 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
|
@ -26,10 +28,10 @@ output:
|
|||
add x1, x1, :lo12:output
|
||||
|
||||
mov x2, x0
|
||||
ldr q0, [x2], 8
|
||||
ldr q0, [x2], 16
|
||||
ldr q1, [x2]
|
||||
mov x2, x0
|
||||
ldr q2, [x2], 8
|
||||
ldr q2, [x2], 16
|
||||
ldr q3, [x2]
|
||||
|
||||
mov x2, x1
|
||||
|
@ -61,9 +63,9 @@ output:
|
|||
addv b5, v5.16b
|
||||
mov x5, v4.d[0]
|
||||
mov x6, v5.d[0]
|
||||
cmp x5, #136
|
||||
cmp x5, #200
|
||||
bne .Lfailure
|
||||
cmp x6, #8
|
||||
cmp x6, #72
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
|
@ -82,11 +84,11 @@ output:
|
|||
mov x4, v4.d[0]
|
||||
mov x5, v5.d[0]
|
||||
mov x6, v6.d[0]
|
||||
cmp x4, #88
|
||||
cmp x4, #120
|
||||
bne .Lfailure
|
||||
cmp x5, #200
|
||||
cmp x5, #8
|
||||
bne .Lfailure
|
||||
cmp x6, #248
|
||||
cmp x6, #24
|
||||
bne .Lfailure
|
||||
|
||||
mov x2, x1
|
||||
|
@ -108,13 +110,13 @@ output:
|
|||
mov x5, v5.d[0]
|
||||
mov x6, v6.d[0]
|
||||
mov x7, v7.d[0]
|
||||
cmp x4, #104
|
||||
cmp x4, #168
|
||||
bne .Lfailure
|
||||
cmp x5, #168
|
||||
cmp x5, #232
|
||||
bne .Lfailure
|
||||
cmp x6, #232
|
||||
cmp x6, #40
|
||||
bne .Lfailure
|
||||
cmp x7, #40
|
||||
cmp x7, #104
|
||||
bne .Lfailure
|
||||
|
||||
pass
|
||||
|
|
|
@ -6,9 +6,6 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
|
||||
start
|
||||
|
||||
mov x0, #-2
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
|
||||
.include "testutils.inc"
|
||||
|
||||
.data
|
||||
.align 4
|
||||
input1:
|
||||
.word 0x04030201
|
||||
.word 0x08070605
|
||||
|
|
Loading…
Reference in New Issue