Fix ldn/stn multiple instructions. Fix testcases with unaligned data.

sim/aarch64/
	* simulator.c (vec_load): Add M argument.  Rewrite to iterate over
	registers based on structure size.
	(LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
	(LD1_1): Replace with call to vec_load.
	(vec_store): Add new M argument.  Rewrite to iterate over registers
	based on structure size.
	(ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
	(ST1_1): Replace with call to vec_store.

sim/testsuite/sim/aarch64/
	* fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align
	data.
	* sumulh.s: Delete unnecessary data alignment.
	* stn_single.s: Align data.  Fix unaligned ldr insns.  Adjust cmp
	arguments to match change.
	* ldn_multiple.s, stn_multiple.s: New.
Jim Wilson 2017-04-22 16:36:01 -07:00
parent 10f489e576
commit bf1554384b
14 changed files with 454 additions and 202 deletions
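
The core of the fix: the old vec_load/vec_store loops always filled the register list sequentially, which is LD1/ST1 semantics, so LD2/LD3/LD4 (and the ST equivalents) never de-interleaved their structure elements. The rewrite distinguishes rpt, the number of whole-register repeats (M for the one-element multi-register forms, 1 otherwise), from selem, the number of elements per structure (N). The following is a minimal standalone sketch of the indexing rule the new loops implement; the model() helper is hypothetical, illustrative code only, not part of simulator.c:

#include <stdio.h>

/* Print where each memory element lands for one LDn "multiple
   structures" form.  lanes = elements per register, rpt = register
   repeats, selem = structure elements (the n in LDn).  */
static void
model (const char *name, unsigned lanes, unsigned rpt, unsigned selem)
{
  unsigned i, j, k, t = 0;

  printf ("%s:\n", name);
  for (i = 0; i < rpt; i++)
    for (j = 0; j < lanes; j++)
      for (k = 0; k < selem; k++)
        printf ("  mem[%2u] -> v%u.b[%u]\n", t++, i + k, j);
}

int
main (void)
{
  /* ld2 {v0.8b, v1.8b}: one repeat, two interleaved elements.  */
  model ("ld2 {v0.8b, v1.8b}", 8, 1, 2);
  /* ld1 {v0.8b, v1.8b}: two repeats, one element, a sequential fill.
     The old code used the sequential fill for both instructions.  */
  model ("ld1 {v0.8b, v1.8b}", 8, 2, 1);
  return 0;
}
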

sim/aarch64/ChangeLog

@@ -1,3 +1,14 @@
+2017-04-22  Jim Wilson  <jim.wilson@linaro.org>
+
+	* simulator.c (vec_load): Add M argument.  Rewrite to iterate over
+	registers based on structure size.
+	(LD4, LD3, LD2, LD1_2, LD1_3, LD1_4): Pass new arg to vec_load.
+	(LD1_1): Replace with call to vec_load.
+	(vec_store): Add new M argument.  Rewrite to iterate over registers
+	based on structure size.
+	(ST4, ST3, ST2, ST1_2, ST1_3, ST1_4): Pass new arg to vec_store.
+	(ST1_1): Replace with call to vec_store.
+
2017-04-08  Jim Wilson  <jim.wilson@linaro.org>

	* simulator.c (do_vec_FCVTL): New.

sim/aarch64/simulator.c

@@ -11524,310 +11524,224 @@ vec_reg (unsigned v, unsigned o)
  return (v + o) & 0x3F;
}

-/* Load multiple N-element structures to N consecutive registers.  */
+/* Load multiple N-element structures to M consecutive registers.  */
static void
-vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
  int all = INSTR (30, 30);
  unsigned size = INSTR (11, 10);
  unsigned vd = INSTR (4, 0);
-  unsigned i;
+  unsigned rpt = (N == M) ? 1 : M;
+  unsigned selem = N;
+  unsigned i, j, k;

  switch (size)
    {
    case 0: /* 8-bit operations.  */
-      if (all)
-        for (i = 0; i < (16 * N); i++)
-          aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
-                              aarch64_get_mem_u8 (cpu, address + i));
-      else
-        for (i = 0; i < (8 * N); i++)
-          aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
-                              aarch64_get_mem_u8 (cpu, address + i));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (8 + (8 * all)); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
+                                  aarch64_get_mem_u8 (cpu, address));
+              address += 1;
+            }
      return;

    case 1: /* 16-bit operations.  */
-      if (all)
-        for (i = 0; i < (8 * N); i++)
-          aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
-                               aarch64_get_mem_u16 (cpu, address + i * 2));
-      else
-        for (i = 0; i < (4 * N); i++)
-          aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
-                               aarch64_get_mem_u16 (cpu, address + i * 2));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (4 + (4 * all)); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
+                                   aarch64_get_mem_u16 (cpu, address));
+              address += 2;
+            }
      return;

    case 2: /* 32-bit operations.  */
-      if (all)
-        for (i = 0; i < (4 * N); i++)
-          aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
-                               aarch64_get_mem_u32 (cpu, address + i * 4));
-      else
-        for (i = 0; i < (2 * N); i++)
-          aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
-                               aarch64_get_mem_u32 (cpu, address + i * 4));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (2 + (2 * all)); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
+                                   aarch64_get_mem_u32 (cpu, address));
+              address += 4;
+            }
      return;

    case 3: /* 64-bit operations.  */
-      if (all)
-        for (i = 0; i < (2 * N); i++)
-          aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
-                               aarch64_get_mem_u64 (cpu, address + i * 8));
-      else
-        for (i = 0; i < N; i++)
-          aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
-                               aarch64_get_mem_u64 (cpu, address + i * 8));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (1 + all); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
+                                   aarch64_get_mem_u64 (cpu, address));
+              address += 8;
+            }
      return;
    }
}
-/* LD4: load multiple 4-element to four consecutive registers.  */
+/* Load multiple 4-element structures into four consecutive registers.  */
static void
LD4 (sim_cpu *cpu, uint64_t address)
{
-  vec_load (cpu, address, 4);
+  vec_load (cpu, address, 4, 4);
}

-/* LD3: load multiple 3-element structures to three consecutive registers.  */
+/* Load multiple 3-element structures into three consecutive registers.  */
static void
LD3 (sim_cpu *cpu, uint64_t address)
{
-  vec_load (cpu, address, 3);
+  vec_load (cpu, address, 3, 3);
}

-/* LD2: load multiple 2-element structures to two consecutive registers.  */
+/* Load multiple 2-element structures into two consecutive registers.  */
static void
LD2 (sim_cpu *cpu, uint64_t address)
{
-  vec_load (cpu, address, 2);
+  vec_load (cpu, address, 2, 2);
}
/* Load multiple 1-element structures into one register.  */
static void
LD1_1 (sim_cpu *cpu, uint64_t address)
{
-  int all = INSTR (30, 30);
-  unsigned size = INSTR (11, 10);
-  unsigned vd = INSTR (4, 0);
-  unsigned i;
-
-  switch (size)
-    {
-    case 0:
-      /* LD1 {Vd.16b}, addr, #16 */
-      /* LD1 {Vd.8b}, addr, #8 */
-      for (i = 0; i < (all ? 16 : 8); i++)
-        aarch64_set_vec_u8 (cpu, vd, i,
-                            aarch64_get_mem_u8 (cpu, address + i));
-      return;
-
-    case 1:
-      /* LD1 {Vd.8h}, addr, #16 */
-      /* LD1 {Vd.4h}, addr, #8 */
-      for (i = 0; i < (all ? 8 : 4); i++)
-        aarch64_set_vec_u16 (cpu, vd, i,
-                             aarch64_get_mem_u16 (cpu, address + i * 2));
-      return;
-
-    case 2:
-      /* LD1 {Vd.4s}, addr, #16 */
-      /* LD1 {Vd.2s}, addr, #8 */
-      for (i = 0; i < (all ? 4 : 2); i++)
-        aarch64_set_vec_u32 (cpu, vd, i,
-                             aarch64_get_mem_u32 (cpu, address + i * 4));
-      return;
-
-    case 3:
-      /* LD1 {Vd.2d}, addr, #16 */
-      /* LD1 {Vd.1d}, addr, #8 */
-      for (i = 0; i < (all ? 2 : 1); i++)
-        aarch64_set_vec_u64 (cpu, vd, i,
-                             aarch64_get_mem_u64 (cpu, address + i * 8));
-      return;
-    }
+  vec_load (cpu, address, 1, 1);
}

/* Load multiple 1-element structures into two registers.  */
static void
LD1_2 (sim_cpu *cpu, uint64_t address)
{
-  /* FIXME: This algorithm is *exactly* the same as the LD2 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 2);
+  vec_load (cpu, address, 1, 2);
}

/* Load multiple 1-element structures into three registers.  */
static void
LD1_3 (sim_cpu *cpu, uint64_t address)
{
-  /* FIXME: This algorithm is *exactly* the same as the LD3 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 3);
+  vec_load (cpu, address, 1, 3);
}

/* Load multiple 1-element structures into four registers.  */
static void
LD1_4 (sim_cpu *cpu, uint64_t address)
{
-  /* FIXME: This algorithm is *exactly* the same as the LD4 version.
-     So why have two different instructions ?  There must be something
-     wrong somewhere.  */
-  vec_load (cpu, address, 4);
+  vec_load (cpu, address, 1, 4);
}
-/* Store multiple N-element structures to N consecutive registers.  */
+/* Store multiple N-element structures from M consecutive registers.  */
static void
-vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
  int all = INSTR (30, 30);
  unsigned size = INSTR (11, 10);
  unsigned vd = INSTR (4, 0);
-  unsigned i;
+  unsigned rpt = (N == M) ? 1 : M;
+  unsigned selem = N;
+  unsigned i, j, k;

  switch (size)
    {
    case 0: /* 8-bit operations.  */
-      if (all)
-        for (i = 0; i < (16 * N); i++)
-          aarch64_set_mem_u8
-            (cpu, address + i,
-             aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
-      else
-        for (i = 0; i < (8 * N); i++)
-          aarch64_set_mem_u8
-            (cpu, address + i,
-             aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (8 + (8 * all)); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_mem_u8
+                (cpu, address,
+                 aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
+              address += 1;
+            }
      return;

    case 1: /* 16-bit operations.  */
-      if (all)
-        for (i = 0; i < (8 * N); i++)
-          aarch64_set_mem_u16
-            (cpu, address + i * 2,
-             aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
-      else
-        for (i = 0; i < (4 * N); i++)
-          aarch64_set_mem_u16
-            (cpu, address + i * 2,
-             aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (4 + (4 * all)); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_mem_u16
+                (cpu, address,
+                 aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
+              address += 2;
+            }
      return;

    case 2: /* 32-bit operations.  */
-      if (all)
-        for (i = 0; i < (4 * N); i++)
-          aarch64_set_mem_u32
-            (cpu, address + i * 4,
-             aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
-      else
-        for (i = 0; i < (2 * N); i++)
-          aarch64_set_mem_u32
-            (cpu, address + i * 4,
-             aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (2 + (2 * all)); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_mem_u32
+                (cpu, address,
+                 aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
+              address += 4;
+            }
      return;

    case 3: /* 64-bit operations.  */
-      if (all)
-        for (i = 0; i < (2 * N); i++)
-          aarch64_set_mem_u64
-            (cpu, address + i * 8,
-             aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
-      else
-        for (i = 0; i < N; i++)
-          aarch64_set_mem_u64
-            (cpu, address + i * 8,
-             aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
+      for (i = 0; i < rpt; i++)
+        for (j = 0; j < (1 + all); j++)
+          for (k = 0; k < selem; k++)
+            {
+              aarch64_set_mem_u64
+                (cpu, address,
+                 aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
+              address += 8;
+            }
      return;
    }
}
-/* Store multiple 4-element structure to four consecutive registers.  */
+/* Store multiple 4-element structure from four consecutive registers.  */
static void
ST4 (sim_cpu *cpu, uint64_t address)
{
-  vec_store (cpu, address, 4);
+  vec_store (cpu, address, 4, 4);
}

-/* Store multiple 3-element structures to three consecutive registers.  */
+/* Store multiple 3-element structures from three consecutive registers.  */
static void
ST3 (sim_cpu *cpu, uint64_t address)
{
-  vec_store (cpu, address, 3);
+  vec_store (cpu, address, 3, 3);
}

-/* Store multiple 2-element structures to two consecutive registers.  */
+/* Store multiple 2-element structures from two consecutive registers.  */
static void
ST2 (sim_cpu *cpu, uint64_t address)
{
-  vec_store (cpu, address, 2);
+  vec_store (cpu, address, 2, 2);
}

-/* Store multiple 1-element structures into one register.  */
+/* Store multiple 1-element structures from one register.  */
static void
ST1_1 (sim_cpu *cpu, uint64_t address)
{
-  int all = INSTR (30, 30);
-  unsigned size = INSTR (11, 10);
-  unsigned vd = INSTR (4, 0);
-  unsigned i;
-
-  switch (size)
-    {
-    case 0:
-      for (i = 0; i < (all ? 16 : 8); i++)
-        aarch64_set_mem_u8 (cpu, address + i,
-                            aarch64_get_vec_u8 (cpu, vd, i));
-      return;
-
-    case 1:
-      for (i = 0; i < (all ? 8 : 4); i++)
-        aarch64_set_mem_u16 (cpu, address + i * 2,
-                             aarch64_get_vec_u16 (cpu, vd, i));
-      return;
-
-    case 2:
-      for (i = 0; i < (all ? 4 : 2); i++)
-        aarch64_set_mem_u32 (cpu, address + i * 4,
-                             aarch64_get_vec_u32 (cpu, vd, i));
-      return;
-
-    case 3:
-      for (i = 0; i < (all ? 2 : 1); i++)
-        aarch64_set_mem_u64 (cpu, address + i * 8,
-                             aarch64_get_vec_u64 (cpu, vd, i));
-      return;
-    }
+  vec_store (cpu, address, 1, 1);
}

-/* Store multiple 1-element structures into two registers.  */
+/* Store multiple 1-element structures from two registers.  */
static void
ST1_2 (sim_cpu *cpu, uint64_t address)
{
-  /* FIXME: This algorithm is *exactly* the same as the ST2 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 2);
+  vec_store (cpu, address, 1, 2);
}

-/* Store multiple 1-element structures into three registers.  */
+/* Store multiple 1-element structures from three registers.  */
static void
ST1_3 (sim_cpu *cpu, uint64_t address)
{
-  /* FIXME: This algorithm is *exactly* the same as the ST3 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 3);
+  vec_store (cpu, address, 1, 3);
}

-/* Store multiple 1-element structures into four registers.  */
+/* Store multiple 1-element structures from four registers.  */
static void
ST1_4 (sim_cpu *cpu, uint64_t address)
{
-  /* FIXME: This algorithm is *exactly* the same as the ST4 version.
-     So why have two different instructions ?  There must be
-     something wrong somewhere.  */
-  vec_store (cpu, address, 4);
+  vec_store (cpu, address, 1, 4);
}
#define LDn_STn_SINGLE_LANE_AND_SIZE() \
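
To make the behavioural change concrete, here is a hedged, self-contained model (illustrative code, not simulator source) of what the old and new loops produce for ld2 {v0.8b, v1.8b} given bytes 0x01..0x10 in memory; only the second result matches the architecture's de-interleave:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint8_t mem[16], old_v[2][8], new_v[2][8];
  unsigned i, j, k;

  for (i = 0; i < 16; i++)
    mem[i] = i + 1;

  /* Old loop: register = i >> 3, lane = i & 7, a plain sequential
     fill, i.e. LD1 semantics applied to LD2.  */
  for (i = 0; i < 16; i++)
    old_v[i >> 3][i & 7] = mem[i];

  /* New loops: rpt = 1, selem = 2, so lane j of v0 and v1 take
     consecutive bytes, giving the architectural de-interleave.  */
  k = 0;
  for (j = 0; j < 8; j++)
    for (i = 0; i < 2; i++)
      new_v[i][j] = mem[k++];

  for (i = 0; i < 2; i++)
    {
      printf ("old v%u:", i);
      for (j = 0; j < 8; j++)
        printf (" %02x", old_v[i][j]);
      printf ("  new v%u:", i);
      for (j = 0; j < 8; j++)
        printf (" %02x", new_v[i][j]);
      printf ("\n");
    }
  return 0;
}
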

sim/testsuite/sim/aarch64/ChangeLog

@ -1,3 +1,12 @@
+2017-04-22  Jim Wilson  <jim.wilson@linaro.org>
+
+	* fcvtz.s, fstur.s, ldn_single.s, ldnr.s, mla.s, mls.s, uzp.s: Align
+	data.
+	* sumulh.s: Delete unnecessary data alignment.
+	* stn_single.s: Align data.  Fix unaligned ldr insns.  Adjust cmp
+	arguments to match change.
+	* ldn_multiple.s, stn_multiple.s: New.
+
2017-04-08  Jim Wilson  <jim.wilson@linaro.org>

	* fcvtl.s: New.

sim/testsuite/sim/aarch64/fcvtz.s

@@ -8,6 +8,7 @@
# For 64-bit unsigned convert, test values 1.5, LONG_MAX, and ULONG_MAX.

	.data
+	.align 4
fm1p5:
	.word 3217031168
fimax:

sim/testsuite/sim/aarch64/fstur.s

@@ -8,6 +8,7 @@
	.include "testutils.inc"

	.data
+	.align 4
fm1:
	.word 3212836864
fmax:

sim/testsuite/sim/aarch64/ldn_multiple.s (new file)

@@ -0,0 +1,136 @@
# mach: aarch64

# Check the load multiple structure instructions: ld1, ld2, ld3, ld4.
# Check the addressing modes: no offset, post-index immediate offset,
# post-index register offset.

	.include "testutils.inc"

	.data
	.align 4
input:
	.word 0x04030201
	.word 0x08070605
	.word 0x0c0b0a09
	.word 0x100f0e0d
	.word 0xfcfdfeff
	.word 0xf8f9fafb
	.word 0xf4f5f6f7
	.word 0xf0f1f2f3

	start
	adrp x0, input
	add x0, x0, :lo12:input

	mov x2, x0
	mov x3, #16
	ld1 {v0.16b}, [x2], 16
	ld1 {v1.8h}, [x2], x3
	addv b4, v0.16b
	addv b5, v1.16b
	mov x4, v4.d[0]
	cmp x4, #136
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #120
	bne .Lfailure

	mov x2, x0
	mov x3, #16
	ld2 {v0.8b, v1.8b}, [x2], x3
	ld2 {v2.4h, v3.4h}, [x2], 16
	addv b4, v0.8b
	addv b5, v1.8b
	addv b6, v2.8b
	addv b7, v3.8b
	mov x4, v4.d[0]
	cmp x4, #64
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #72
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #196
	bne .Lfailure
	mov x7, v7.d[0]
	cmp x7, #180
	bne .Lfailure

	mov x2, x0
	ld3 {v0.2s, v1.2s, v2.2s}, [x2]
	addv b4, v0.8b
	addv b5, v1.8b
	addv b6, v2.8b
	mov x4, v4.d[0]
	cmp x4, #68
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #16
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #16
	bne .Lfailure

	mov x2, x0
	ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]
	addv b4, v0.8b
	addv b5, v1.8b
	addv b6, v2.8b
	addv b7, v3.8b
	mov x4, v4.d[0]
	cmp x4, #0
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #0
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #0
	bne .Lfailure
	mov x7, v7.d[0]
	cmp x7, #0
	bne .Lfailure

	mov x2, x0
	ld1 {v0.4s, v1.4s}, [x2]
	addv b4, v0.16b
	addv b5, v1.16b
	mov x4, v4.d[0]
	cmp x4, #136
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #120
	bne .Lfailure

	mov x2, x0
	ld1 {v0.1d, v1.1d, v2.1d}, [x2]
	addv b4, v0.8b
	addv b5, v1.8b
	addv b6, v2.8b
	mov x4, v4.d[0]
	cmp x4, #36
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #100
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #220
	bne .Lfailure

	mov x2, x0
	ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x2]
	addv b4, v0.8b
	addv b5, v1.8b
	addv b6, v2.8b
	mov x4, v4.d[0]
	cmp x4, #36
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #100
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #220
	bne .Lfailure

	pass
.Lfailure:
	fail
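
The immediates in the cmp instructions are byte-lane sums modulo 256: addv Bd, Vn.8b/16b adds every byte lane, and moving the D lane to an X register keeps only that low byte. A hedged standalone sketch (not part of the testsuite) recomputing a few of the expected values from the input data:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint8_t mem[32];
  unsigned i, sum[3];

  for (i = 0; i < 16; i++)
    mem[i] = i + 1;          /* .word 0x04030201 ... is 01..10 in memory */
  for (i = 0; i < 16; i++)
    mem[16 + i] = 0xff - i;  /* .word 0xfcfdfeff ... is ff..f0 in memory */

  /* ld1 {v0.16b}: sequential load, so 1+2+...+16 = 136.  */
  for (sum[0] = 0, i = 0; i < 16; i++)
    sum[0] += mem[i];
  printf ("ld1 v0: %u\n", sum[0] % 256);            /* 136 */

  /* ld2 {v0.8b, v1.8b}: byte t goes to register t%2, lane t/2.  */
  sum[0] = sum[1] = 0;
  for (i = 0; i < 16; i++)
    sum[i % 2] += mem[i];
  printf ("ld2: %u %u\n", sum[0] % 256, sum[1] % 256);  /* 64 72 */

  /* ld3 {v0.2s, v1.2s, v2.2s}: 32-bit element t goes to register t%3.  */
  sum[0] = sum[1] = sum[2] = 0;
  for (i = 0; i < 24; i++)
    sum[(i / 4) % 3] += mem[i];
  printf ("ld3: %u %u %u\n",
          sum[0] % 256, sum[1] % 256, sum[2] % 256);  /* 68 16 16 */
  return 0;
}
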

sim/testsuite/sim/aarch64/ldn_single.s

@@ -7,6 +7,8 @@
	.include "testutils.inc"

	.data
+	.align 4
input:
	.word 0x04030201
	.word 0x08070605

sim/testsuite/sim/aarch64/ldnr.s

@@ -7,6 +7,8 @@
	.include "testutils.inc"

	.data
+	.align 4
input:
	.word 0x04030201
	.word 0x08070605

sim/testsuite/sim/aarch64/mla.s

@@ -4,6 +4,8 @@
	.include "testutils.inc"

	.data
+	.align 4
input:
	.word 0x04030201
	.word 0x08070605

sim/testsuite/sim/aarch64/mls.s

@@ -4,6 +4,8 @@
	.include "testutils.inc"

	.data
+	.align 4
input:
	.word 0x04030201
	.word 0x08070605

sim/testsuite/sim/aarch64/stn_multiple.s (new file)

@@ -0,0 +1,171 @@
# mach: aarch64

# Check the store multiple structure instructions: st1, st2, st3, st4.
# Check the addressing modes: no offset, post-index immediate offset,
# post-index register offset.

	.include "testutils.inc"

	.data
	.align 4
input:
	.word 0x04030201
	.word 0x08070605
	.word 0x0c0b0a09
	.word 0x100f0e0d
	.word 0xfcfdfeff
	.word 0xf8f9fafb
	.word 0xf4f5f6f7
	.word 0xf0f1f2f3
output:
	.zero 64

	start
	adrp x0, input
	add x0, x0, :lo12:input
	adrp x1, output
	add x1, x1, :lo12:output

	mov x2, x0
	ldr q0, [x2], 16
	ldr q1, [x2]
	mov x2, x0
	ldr q2, [x2], 16
	ldr q3, [x2]

	mov x2, x1
	mov x3, #16
	st1 {v0.16b}, [x2], 16
	st1 {v1.8h}, [x2], x3
	mov x2, x1
	ldr q4, [x2], 16
	ldr q5, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	mov x4, v4.d[0]
	cmp x4, #136
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #120
	bne .Lfailure

	mov x2, x1
	mov x3, #16
	st2 {v0.8b, v1.8b}, [x2], 16
	st2 {v2.4h, v3.4h}, [x2], x3
	mov x2, x1
	ldr q4, [x2], 16
	ldr q5, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	mov x4, v4.d[0]
	cmp x4, #0
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #0
	bne .Lfailure

	mov x2, x1
	st3 {v0.4s, v1.4s, v2.4s}, [x2]
	ldr q4, [x2], 16
	ldr q5, [x2], 16
	ldr q6, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	addv b6, v6.16b
	mov x4, v4.d[0]
	cmp x4, #36
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #0
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #100
	bne .Lfailure

	mov x2, x1
	st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
	ldr q4, [x2], 16
	ldr q5, [x2], 16
	ldr q6, [x2], 16
	ldr q7, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	addv b6, v6.16b
	addv b7, v7.16b
	mov x4, v4.d[0]
	cmp x4, #0
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #0
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #0
	bne .Lfailure
	mov x7, v7.d[0]
	cmp x7, #0
	bne .Lfailure

	mov x2, x1
	st1 {v0.2s, v1.2s}, [x2], 16
	st1 {v2.1d, v3.1d}, [x2]
	mov x2, x1
	ldr q4, [x2], 16
	ldr q5, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	mov x4, v4.d[0]
	cmp x4, #0
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #0
	bne .Lfailure

	mov x2, x1
	st1 {v0.2d, v1.2d, v2.2d}, [x2]
	mov x2, x1
	ldr q4, [x2], 16
	ldr q5, [x2], 16
	ldr q6, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	addv b6, v6.16b
	mov x4, v4.d[0]
	cmp x4, #136
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #120
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #136
	bne .Lfailure

	mov x2, x1
	st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x2]
	mov x2, x1
	ldr q4, [x2], 16
	ldr q5, [x2], 16
	ldr q6, [x2], 16
	ldr q7, [x2]
	addv b4, v4.16b
	addv b5, v5.16b
	addv b6, v6.16b
	addv b7, v7.16b
	mov x4, v4.d[0]
	cmp x4, #136
	bne .Lfailure
	mov x5, v5.d[0]
	cmp x5, #120
	bne .Lfailure
	mov x6, v6.d[0]
	cmp x6, #136
	bne .Lfailure
	mov x7, v7.d[0]
	cmp x7, #120
	bne .Lfailure

	pass
.Lfailure:
	fail
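
The store-side expectations follow the same modulo-256 arithmetic. A hedged standalone check (illustrative, not testsuite code) of the st2 case above: q0 holds bytes 01..10 and q1 holds ff..f0, so st2 {v0.8b, v1.8b} interleaves their low halves and the 16 output bytes sum to 2048, which is why the test compares against #0:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint8_t v0[8], v1[8], out[16];
  unsigned i, sum = 0;

  for (i = 0; i < 8; i++)
    {
      v0[i] = i + 1;     /* low half of q0: 01..08 */
      v1[i] = 0xff - i;  /* low half of q1: ff..f8 */
    }
  /* st2 {v0.8b, v1.8b}: lane j of each register, interleaved.  */
  for (i = 0; i < 8; i++)
    {
      out[2 * i] = v0[i];
      out[2 * i + 1] = v1[i];
    }
  for (i = 0; i < 16; i++)
    sum += out[i];
  printf ("%u\n", sum % 256);   /* 0, matching the cmp x4, #0 check */
  return 0;
}
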

sim/testsuite/sim/aarch64/stn_single.s

@@ -7,6 +7,8 @@
	.include "testutils.inc"

	.data
+	.align 4
input:
	.word 0x04030201
	.word 0x08070605
@@ -26,10 +28,10 @@ output:
	add x1, x1, :lo12:output

	mov x2, x0
-	ldr q0, [x2], 8
+	ldr q0, [x2], 16
	ldr q1, [x2]
	mov x2, x0
-	ldr q2, [x2], 8
+	ldr q2, [x2], 16
	ldr q3, [x2]

	mov x2, x1
@@ -61,9 +63,9 @@ output:
	addv b5, v5.16b
	mov x5, v4.d[0]
	mov x6, v5.d[0]
-	cmp x5, #136
+	cmp x5, #200
	bne .Lfailure
-	cmp x6, #8
+	cmp x6, #72
	bne .Lfailure

	mov x2, x1
@@ -82,11 +84,11 @@ output:
	mov x4, v4.d[0]
	mov x5, v5.d[0]
	mov x6, v6.d[0]
-	cmp x4, #88
+	cmp x4, #120
	bne .Lfailure
-	cmp x5, #200
+	cmp x5, #8
	bne .Lfailure
-	cmp x6, #248
+	cmp x6, #24
	bne .Lfailure

	mov x2, x1
@@ -108,13 +110,13 @@ output:
	mov x5, v5.d[0]
	mov x6, v6.d[0]
	mov x7, v7.d[0]
-	cmp x4, #104
+	cmp x4, #168
	bne .Lfailure
-	cmp x5, #168
+	cmp x5, #232
	bne .Lfailure
-	cmp x6, #232
+	cmp x6, #40
	bne .Lfailure
-	cmp x7, #40
+	cmp x7, #104
	bne .Lfailure

	pass
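
The cmp immediates change because the ldr fix changes what q1 and q3 hold: with the old ldr q0, [x2], 8, the second load overlapped the first, so q1 and q3 held input bytes 9-24; the corrected 16-byte post-index makes them hold bytes 17-32, and every expected addv byte sum in this file shifts to match, following the same modulo-256 lane-sum arithmetic sketched earlier.
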

sim/testsuite/sim/aarch64/sumulh.s

@@ -6,9 +6,6 @@
	.include "testutils.inc"

-	.data
-	.align 4
-
	start
	mov x0, #-2

sim/testsuite/sim/aarch64/uzp.s

@@ -4,6 +4,8 @@
	.include "testutils.inc"

	.data
+	.align 4
input1:
	.word 0x04030201
	.word 0x08070605