target/arm: Promote consecutive memory ops for aa64

For a sequence of loads or stores from a single register,
little-endian operations can be promoted to an 8-byte op.
This can reduce the number of operations by up to a factor of 8.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20181011205206.3552-5-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Richard Henderson 2018-10-24 07:50:18 +01:00 committed by Peter Maydell
parent 10e0b33c67
commit 87f9a7f0c8
1 changed file with 40 additions and 26 deletions

View File

@@ -1201,25 +1201,23 @@ static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
/* Store from vector register to memory */
static void do_vec_st(DisasContext *s, int srcidx, int element,
TCGv_i64 tcg_addr, int size)
TCGv_i64 tcg_addr, int size, TCGMemOp endian)
{
TCGMemOp memop = s->be_data + size;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
read_vec_element(s, tcg_tmp, srcidx, element, size);
tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
tcg_temp_free_i64(tcg_tmp);
}
/* Load from memory to vector register */
static void do_vec_ld(DisasContext *s, int destidx, int element,
TCGv_i64 tcg_addr, int size)
TCGv_i64 tcg_addr, int size, TCGMemOp endian)
{
TCGMemOp memop = s->be_data + size;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
write_vec_element(s, tcg_tmp, destidx, element, size);
tcg_temp_free_i64(tcg_tmp);
@@ -3019,9 +3017,10 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
bool is_postidx = extract32(insn, 23, 1);
bool is_q = extract32(insn, 30, 1);
TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
TCGMemOp endian = s->be_data;
int ebytes = 1 << size;
int elements = (is_q ? 128 : 64) / (8 << size);
int ebytes; /* bytes per element */
int elements; /* elements per vector */
int rpt; /* num iterations */
int selem; /* structure elements */
int r;
@@ -3080,6 +3079,20 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
gen_check_sp_alignment(s);
}
/* For our purposes, bytes are always little-endian. */
if (size == 0) {
endian = MO_LE;
}
/* Consecutive little-endian elements from a single register
* can be promoted to a larger little-endian operation.
*/
if (selem == 1 && endian == MO_LE) {
size = 3;
}
ebytes = 1 << size;
elements = (is_q ? 16 : 8) / ebytes;
tcg_rn = cpu_reg_sp(s, rn);
tcg_addr = tcg_temp_new_i64();
tcg_gen_mov_i64(tcg_addr, tcg_rn);
@@ -3088,32 +3101,33 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
for (r = 0; r < rpt; r++) {
int e;
for (e = 0; e < elements; e++) {
int tt = (rt + r) % 32;
int xs;
for (xs = 0; xs < selem; xs++) {
int tt = (rt + r + xs) % 32;
if (is_store) {
do_vec_st(s, tt, e, tcg_addr, size);
do_vec_st(s, tt, e, tcg_addr, size, endian);
} else {
do_vec_ld(s, tt, e, tcg_addr, size);
/* For non-quad operations, setting a slice of the low
* 64 bits of the register clears the high 64 bits (in
* the ARM ARM pseudocode this is implicit in the fact
* that 'rval' is a 64 bit wide variable).
* For quad operations, we might still need to zero the
* high bits of SVE. We optimize by noticing that we only
* need to do this the first time we touch a register.
*/
if (e == 0 && (r == 0 || xs == selem - 1)) {
clear_vec_high(s, is_q, tt);
}
do_vec_ld(s, tt, e, tcg_addr, size, endian);
}
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
tt = (tt + 1) % 32;
}
}
}
if (!is_store) {
/* For non-quad operations, setting a slice of the low
* 64 bits of the register clears the high 64 bits (in
* the ARM ARM pseudocode this is implicit in the fact
* that 'rval' is a 64 bit wide variable).
* For quad operations, we might still need to zero the
* high bits of SVE.
*/
for (r = 0; r < rpt * selem; r++) {
int tt = (rt + r) % 32;
clear_vec_high(s, is_q, tt);
}
}
if (is_postidx) {
int rm = extract32(insn, 16, 5);
if (rm == 31) {
@@ -3234,9 +3248,9 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
} else {
/* Load/store one element per register */
if (is_load) {
do_vec_ld(s, rt, index, tcg_addr, scale);
do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
} else {
do_vec_st(s, rt, index, tcg_addr, scale);
do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
}
}
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);