From 3698747c48db871d876a398592c5a23d7580ed4a Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Thu, 30 Apr 2020 19:09:39 +0100 Subject: [PATCH] target/arm: Convert Neon 'load single structure to all lanes' to decodetree Convert the Neon "load single structure to all lanes" insns to decodetree. Signed-off-by: Peter Maydell Reviewed-by: Richard Henderson Message-id: 20200430181003.21682-13-peter.maydell@linaro.org --- target/arm/neon-ls.decode | 5 +++ target/arm/translate-neon.inc.c | 73 +++++++++++++++++++++++++++++++++ target/arm/translate.c | 55 +------------------------ 3 files changed, 80 insertions(+), 53 deletions(-) diff --git a/target/arm/neon-ls.decode b/target/arm/neon-ls.decode index dd03d5a37b..f0ab6d2c98 100644 --- a/target/arm/neon-ls.decode +++ b/target/arm/neon-ls.decode @@ -34,3 +34,8 @@ VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \ vd=%vd_dp + +# Neon load single element to all lanes + +VLD_all_lanes 1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \ + vd=%vd_dp diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c index 966c0d9201..e60e9559ba 100644 --- a/target/arm/translate-neon.inc.c +++ b/target/arm/translate-neon.inc.c @@ -398,3 +398,76 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8); return true; } + +static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a) +{ + /* Neon load single structure to all lanes */ + int reg, stride, vec_size; + int vd = a->vd; + int size = a->size; + int nregs = a->n + 1; + TCGv_i32 addr, tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (size == 3) { + if (nregs != 4 || a->a == 0) { + return false; + } + /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */ + size = 2; + } + if (nregs == 1 && a->a == 1 && size == 0) { + return false; + } + if (nregs == 3 && a->a == 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * VLD1 to all lanes: T bit indicates how many Dregs to write. + * VLD2/3/4 to all lanes: T bit indicates register stride. + */ + stride = a->t ? 2 : 1; + vec_size = nregs == 1 ? stride * 8 : 8; + + tmp = tcg_temp_new_i32(); + addr = tcg_temp_new_i32(); + load_reg_var(s, addr, a->rn); + for (reg = 0; reg < nregs; reg++) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), + s->be_data | size); + if ((vd & 1) && vec_size == 16) { + /* + * We cannot write 16 bytes at once because the + * destination is unaligned. + */ + tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0), + 8, 8, tmp); + tcg_gen_gvec_mov(0, neon_reg_offset(vd + 1, 0), + neon_reg_offset(vd, 0), 8, 8); + } else { + tcg_gen_gvec_dup_i32(size, neon_reg_offset(vd, 0), + vec_size, vec_size, tmp); + } + tcg_gen_addi_i32(addr, addr, 1 << size); + vd += stride; + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs); + + return true; +} diff --git a/target/arm/translate.c b/target/arm/translate.c index be56cbb061..7099274c92 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -3224,7 +3224,6 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn) int size; int reg; int load; - int vec_size; TCGv_i32 addr; TCGv_i32 tmp; @@ -3254,58 +3253,8 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn) } else { size = (insn >> 10) & 3; if (size == 3) { - /* Load single element to all lanes. */ - int a = (insn >> 4) & 1; - if (!load) { - return 1; - } - size = (insn >> 6) & 3; - nregs = ((insn >> 8) & 3) + 1; - - if (size == 3) { - if (nregs != 4 || a == 0) { - return 1; - } - /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */ - size = 2; - } - if (nregs == 1 && a == 1 && size == 0) { - return 1; - } - if (nregs == 3 && a == 1) { - return 1; - } - addr = tcg_temp_new_i32(); - load_reg_var(s, addr, rn); - - /* VLD1 to all lanes: bit 5 indicates how many Dregs to write. - * VLD2/3/4 to all lanes: bit 5 indicates register stride. - */ - stride = (insn & (1 << 5)) ? 2 : 1; - vec_size = nregs == 1 ? stride * 8 : 8; - - tmp = tcg_temp_new_i32(); - for (reg = 0; reg < nregs; reg++) { - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), - s->be_data | size); - if ((rd & 1) && vec_size == 16) { - /* We cannot write 16 bytes at once because the - * destination is unaligned. - */ - tcg_gen_gvec_dup_i32(size, neon_reg_offset(rd, 0), - 8, 8, tmp); - tcg_gen_gvec_mov(0, neon_reg_offset(rd + 1, 0), - neon_reg_offset(rd, 0), 8, 8); - } else { - tcg_gen_gvec_dup_i32(size, neon_reg_offset(rd, 0), - vec_size, vec_size, tmp); - } - tcg_gen_addi_i32(addr, addr, 1 << size); - rd += stride; - } - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(addr); - stride = (1 << size) * nregs; + /* Load single element to all lanes -- handled by decodetree */ + return 1; } else { /* Single element. */ int idx = (insn >> 4) & 0xf;