From 10a85e2c8ab6e004e7f3f1dcfea8cb0bf58fb9fb Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Fri, 8 May 2020 08:43:58 -0700
Subject: [PATCH] target/arm: Reuse sve_probe_page for gather loads

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200508154359.7494-19-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/sve_helper.c | 208 +++++++++++++++++++++-------------
 1 file changed, 109 insertions(+), 99 deletions(-)

diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index ad7e10f1e7..f1870aabc2 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -5124,130 +5124,140 @@ static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
     return *(uint64_t *)(reg + reg_ofs);
 }
 
-static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
-                       target_ulong base, uint32_t desc, uintptr_t ra,
-                       zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+               target_ulong base, uint32_t desc, uintptr_t retaddr,
+               int esize, int msize, zreg_off_fn *off_fn,
+               sve_ldst1_host_fn *host_fn,
+               sve_ldst1_tlb_fn *tlb_fn)
 {
     const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
-    intptr_t i, oprsz = simd_oprsz(desc);
-    ARMVectorReg scratch = { };
+    const int mmu_idx = cpu_mmu_index(env, false);
+    const intptr_t reg_max = simd_oprsz(desc);
+    ARMVectorReg scratch;
+    intptr_t reg_off;
+    SVEHostPage info, info2;
 
-    for (i = 0; i < oprsz; ) {
-        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+    memset(&scratch, 0, reg_max);
+    reg_off = 0;
+    do {
+        uint64_t pg = vg[reg_off >> 6];
         do {
             if (likely(pg & 1)) {
-                target_ulong off = off_fn(vm, i);
-                tlb_fn(env, &scratch, i, base + (off << scale), ra);
+                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+                target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
+                               mmu_idx, retaddr);
+
+                if (likely(in_page >= msize)) {
+                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
+                        cpu_check_watchpoint(env_cpu(env), addr, msize,
+                                             info.attrs, BP_MEM_READ, retaddr);
+                    }
+                    /* TODO: MTE check */
+                    host_fn(&scratch, reg_off, info.host);
+                } else {
+                    /* Element crosses the page boundary. */
+                    sve_probe_page(&info2, false, env, addr + in_page, 0,
+                                   MMU_DATA_LOAD, mmu_idx, retaddr);
+                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
+                        cpu_check_watchpoint(env_cpu(env), addr,
+                                             msize, info.attrs,
+                                             BP_MEM_READ, retaddr);
+                    }
+                    /* TODO: MTE check */
+                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
+                }
             }
-            i += 4, pg >>= 4;
-        } while (i & 15);
-    }
+            reg_off += esize;
+            pg >>= esize;
+        } while (reg_off & 63);
+    } while (reg_off < reg_max);
 
     /* Wait until all exceptions have been raised to write back.  */
-    memcpy(vd, &scratch, oprsz);
+    memcpy(vd, &scratch, reg_max);
 }
 
-static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
-                       target_ulong base, uint32_t desc, uintptr_t ra,
-                       zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn)
-{
-    const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
-    intptr_t i, oprsz = simd_oprsz(desc) / 8;
-    ARMVectorReg scratch = { };
-
-    for (i = 0; i < oprsz; i++) {
-        uint8_t pg = *(uint8_t *)(vg + H1(i));
-        if (likely(pg & 1)) {
-            target_ulong off = off_fn(vm, i * 8);
-            tlb_fn(env, &scratch, i * 8, base + (off << scale), ra);
-        }
-    }
-
-    /* Wait until all exceptions have been raised to write back.  */
-    memcpy(vd, &scratch, oprsz * 8);
+#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
+                                 void *vm, target_ulong base, uint32_t desc) \
+{                                                                            \
+    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,             \
+              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
 }
 
-#define DO_LD1_ZPZ_S(MEM, OFS) \
-void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
-    (CPUARMState *env, void *vd, void *vg, void *vm, \
-     target_ulong base, uint32_t desc) \
-{ \
-    sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
-               off_##OFS##_s, sve_ld1##MEM##_tlb); \
+#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
+                                 void *vm, target_ulong base, uint32_t desc) \
+{                                                                            \
+    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,             \
+              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
 }
 
-#define DO_LD1_ZPZ_D(MEM, OFS) \
-void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
-    (CPUARMState *env, void *vd, void *vg, void *vm, \
-     target_ulong base, uint32_t desc) \
-{ \
-    sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
-               off_##OFS##_d, sve_ld1##MEM##_tlb); \
-}
+DO_LD1_ZPZ_S(bsu, zsu, MO_8)
+DO_LD1_ZPZ_S(bsu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zsu, MO_8)
+DO_LD1_ZPZ_D(bdu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zd, MO_8)
 
-DO_LD1_ZPZ_S(bsu, zsu)
-DO_LD1_ZPZ_S(bsu, zss)
-DO_LD1_ZPZ_D(bdu, zsu)
-DO_LD1_ZPZ_D(bdu, zss)
-DO_LD1_ZPZ_D(bdu, zd)
+DO_LD1_ZPZ_S(bss, zsu, MO_8)
+DO_LD1_ZPZ_S(bss, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zsu, MO_8)
+DO_LD1_ZPZ_D(bds, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zd, MO_8)
 
-DO_LD1_ZPZ_S(bss, zsu)
-DO_LD1_ZPZ_S(bss, zss)
-DO_LD1_ZPZ_D(bds, zsu)
-DO_LD1_ZPZ_D(bds, zss)
-DO_LD1_ZPZ_D(bds, zd)
+DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
 
-DO_LD1_ZPZ_S(hsu_le, zsu)
-DO_LD1_ZPZ_S(hsu_le, zss)
-DO_LD1_ZPZ_D(hdu_le, zsu)
-DO_LD1_ZPZ_D(hdu_le, zss)
-DO_LD1_ZPZ_D(hdu_le, zd)
+DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
 
-DO_LD1_ZPZ_S(hsu_be, zsu)
-DO_LD1_ZPZ_S(hsu_be, zss)
-DO_LD1_ZPZ_D(hdu_be, zsu)
-DO_LD1_ZPZ_D(hdu_be, zss)
-DO_LD1_ZPZ_D(hdu_be, zd)
+DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zd, MO_16)
 
-DO_LD1_ZPZ_S(hss_le, zsu)
-DO_LD1_ZPZ_S(hss_le, zss)
-DO_LD1_ZPZ_D(hds_le, zsu)
-DO_LD1_ZPZ_D(hds_le, zss)
-DO_LD1_ZPZ_D(hds_le, zd)
+DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zd, MO_16)
 
-DO_LD1_ZPZ_S(hss_be, zsu)
-DO_LD1_ZPZ_S(hss_be, zss)
-DO_LD1_ZPZ_D(hds_be, zsu)
-DO_LD1_ZPZ_D(hds_be, zss)
-DO_LD1_ZPZ_D(hds_be, zd)
+DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
 
-DO_LD1_ZPZ_S(ss_le, zsu)
-DO_LD1_ZPZ_S(ss_le, zss)
-DO_LD1_ZPZ_D(sdu_le, zsu)
-DO_LD1_ZPZ_D(sdu_le, zss)
-DO_LD1_ZPZ_D(sdu_le, zd)
+DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
 
-DO_LD1_ZPZ_S(ss_be, zsu)
-DO_LD1_ZPZ_S(ss_be, zss)
-DO_LD1_ZPZ_D(sdu_be, zsu)
-DO_LD1_ZPZ_D(sdu_be, zss)
-DO_LD1_ZPZ_D(sdu_be, zd)
+DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_le, zss, MO_32) +DO_LD1_ZPZ_D(sds_le, zd, MO_32) -DO_LD1_ZPZ_D(sds_le, zsu) -DO_LD1_ZPZ_D(sds_le, zss) -DO_LD1_ZPZ_D(sds_le, zd) +DO_LD1_ZPZ_D(sds_be, zsu, MO_32) +DO_LD1_ZPZ_D(sds_be, zss, MO_32) +DO_LD1_ZPZ_D(sds_be, zd, MO_32) -DO_LD1_ZPZ_D(sds_be, zsu) -DO_LD1_ZPZ_D(sds_be, zss) -DO_LD1_ZPZ_D(sds_be, zd) +DO_LD1_ZPZ_D(dd_le, zsu, MO_64) +DO_LD1_ZPZ_D(dd_le, zss, MO_64) +DO_LD1_ZPZ_D(dd_le, zd, MO_64) -DO_LD1_ZPZ_D(dd_le, zsu) -DO_LD1_ZPZ_D(dd_le, zss) -DO_LD1_ZPZ_D(dd_le, zd) - -DO_LD1_ZPZ_D(dd_be, zsu) -DO_LD1_ZPZ_D(dd_be, zss) -DO_LD1_ZPZ_D(dd_be, zd) +DO_LD1_ZPZ_D(dd_be, zsu, MO_64) +DO_LD1_ZPZ_D(dd_be, zss, MO_64) +DO_LD1_ZPZ_D(dd_be, zd, MO_64) #undef DO_LD1_ZPZ_S #undef DO_LD1_ZPZ_D
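
The mechanics of the new sve_ld1_z loop are easy to lose in the macro churn, so here is a stand-alone sketch (not QEMU code) of the two ideas it relies on: the -(addr | TARGET_PAGE_MASK) idiom for computing the bytes remaining in a page, which decides between the single-page fast path and the page-crossing slow path, and staging gathered elements in a scratch buffer so the destination vector is written only after every active element has been probed. All names here (probe, gather_u32, bytes_left_in_page, the 16-byte PAGE_SIZE) are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Tiny 16-byte "pages" so the page-crossing case is easy to exercise. */
#define PAGE_SIZE 16u
#define PAGE_MASK (~(uint64_t)(PAGE_SIZE - 1))
#define NELEM 4

/*
 * Same idiom as the patch's -(addr | TARGET_PAGE_MASK): OR-ing in the
 * page mask leaves only the in-page offset in the low bits, and negating
 * yields the number of bytes from addr to the end of its page.
 */
static uint64_t bytes_left_in_page(uint64_t addr)
{
    return -(addr | PAGE_MASK);
}

/* Stand-in for sve_probe_page(): our "valid" memory is mem_len bytes. */
static bool probe(uint64_t addr, uint64_t mem_len)
{
    return addr < mem_len;
}

/*
 * Gather NELEM host-endian u32 elements from mem at the given byte
 * offsets.  Returns false on a "fault", leaving *vd untouched.
 */
static bool gather_u32(uint32_t *vd, const uint64_t *offs, uint16_t pred,
                       const uint8_t *mem, uint64_t mem_len)
{
    uint32_t scratch[NELEM] = { 0 };        /* inactive elements load 0 */

    for (int i = 0; i < NELEM; i++) {
        if (!(pred & (1u << i))) {
            continue;                       /* predicated-off element */
        }
        uint64_t addr = offs[i];
        uint64_t in_page = bytes_left_in_page(addr);

        if (in_page >= sizeof(uint32_t)) {
            /* Fast path: element contained in one page, probe it once. */
            if (!probe(addr, mem_len)) {
                return false;
            }
        } else {
            /* Element crosses a page boundary: probe both pages. */
            if (!probe(addr, mem_len) || !probe(addr + in_page, mem_len)) {
                return false;
            }
        }
        memcpy(&scratch[i], mem + addr, sizeof(uint32_t));
    }
    /* No element faulted: commit the whole gather in one step. */
    memcpy(vd, scratch, sizeof(scratch));
    return true;
}

int main(void)
{
    uint8_t mem[2 * PAGE_SIZE];
    for (unsigned i = 0; i < sizeof(mem); i++) {
        mem[i] = (uint8_t)i;
    }
    /* Offset 14 straddles the boundary between page 0 and page 1. */
    uint64_t offs[NELEM] = { 0, 4, 14, 28 };
    uint32_t vd[NELEM] = { 0 };

    if (gather_u32(vd, offs, 0xf, mem, sizeof(mem))) {
        for (int i = 0; i < NELEM; i++) {
            printf("vd[%d] = 0x%08x\n", i, vd[i]);
        }
    }
    return 0;
}

The deferred memcpy mirrors the helper's "wait until all exceptions have been raised to write back" comment: a faulting element leaves the helper on the exception path before anything is committed, so the destination vector still holds its pre-instruction value when the guest fault is delivered and the instruction is restarted.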