From 6a6447fe252e2f1c48d6e8cc1bd36515852e8040 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 28 Apr 2023 11:33:28 +0100 Subject: [PATCH 01/12] softmmu: Tidy dirtylimit_dirty_ring_full_time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop inline marker: let compiler decide. Change return type to uint64_t: this matches the computation in the return statement and the local variable assignment in the caller. Rename local to dirty_ring_size_MB to fix typo. Simplify conversion to MiB via qemu_target_page_bits and right shift. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Reviewed-by: Juan Quintela Signed-off-by: Richard Henderson --- softmmu/dirtylimit.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c index 82986c1499..71bf6dc7a4 100644 --- a/softmmu/dirtylimit.c +++ b/softmmu/dirtylimit.c @@ -232,18 +232,23 @@ bool dirtylimit_vcpu_index_valid(int cpu_index) cpu_index >= ms->smp.max_cpus); } -static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) +static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) { static uint64_t max_dirtyrate; - uint32_t dirty_ring_size = kvm_dirty_ring_size(); - uint64_t dirty_ring_size_meory_MB = - dirty_ring_size * qemu_target_page_size() >> 20; + unsigned target_page_bits = qemu_target_page_bits(); + uint64_t dirty_ring_size_MB; + + /* So far, the largest (non-huge) page size is 64k, i.e. 16 bits. */ + assert(target_page_bits < 20); + + /* Convert ring size (pages) to MiB (2**20). */ + dirty_ring_size_MB = kvm_dirty_ring_size() >> (20 - target_page_bits); if (max_dirtyrate < dirtyrate) { max_dirtyrate = dirtyrate; } - return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate; + return dirty_ring_size_MB * 1000000 / max_dirtyrate; } static inline bool dirtylimit_done(uint64_t quota, From ac01ec6fe59458978b32624a6e93b5f2e55b593f Mon Sep 17 00:00:00 2001 From: Weiwei Li Date: Sat, 22 Apr 2023 21:03:27 +0800 Subject: [PATCH 02/12] accel/tcg: Uncache the host address for instruction fetch when tlb size < 1 When PMP entry overlap part of the page, we'll set the tlb_size to 1, which will make the address in tlb entry set with TLB_INVALID_MASK, and the next access will again go through tlb_fill.However, this way will not work in tb_gen_code() => get_page_addr_code_hostp(): the TLB host address will be cached, and the following instructions can use this host address directly which may lead to the bypass of PMP related check. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542. Signed-off-by: Weiwei Li Signed-off-by: Junqiang Wang Reviewed-by: LIU Zhiwei Reviewed-by: Richard Henderson Message-Id: <20230422130329.23555-6-liweiwei@iscas.ac.cn> --- accel/tcg/cputlb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index e984a98dc4..efa0cb67c9 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -1696,6 +1696,11 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr, if (p == NULL) { return -1; } + + if (full->lg_page_size < TARGET_PAGE_BITS) { + return -1; + } + if (hostp) { *hostp = p; } From 8841c815a93a2da3e60fb8173d9c5dc01f552dab Mon Sep 17 00:00:00 2001 From: Dickon Hood Date: Fri, 28 Apr 2023 15:47:46 +0100 Subject: [PATCH 03/12] qemu/bitops.h: Limit rotate amounts Rotates have been fixed up to only allow for reasonable rotate amounts (ie, no rotates >7 on an 8b value etc.) This fixes a problem with riscv vector rotate instructions. Signed-off-by: Dickon Hood Reviewed-by: Richard Henderson Message-Id: <20230428144757.57530-9-lawrence.hunter@codethink.co.uk> [rth: Mask shifts in both directions.] Signed-off-by: Richard Henderson --- include/qemu/bitops.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h index 03213ce952..cb3526d1f4 100644 --- a/include/qemu/bitops.h +++ b/include/qemu/bitops.h @@ -218,7 +218,7 @@ static inline unsigned long find_first_zero_bit(const unsigned long *addr, */ static inline uint8_t rol8(uint8_t word, unsigned int shift) { - return (word << shift) | (word >> ((8 - shift) & 7)); + return (word << (shift & 7)) | (word >> (-shift & 7)); } /** @@ -228,7 +228,7 @@ static inline uint8_t rol8(uint8_t word, unsigned int shift) */ static inline uint8_t ror8(uint8_t word, unsigned int shift) { - return (word >> shift) | (word << ((8 - shift) & 7)); + return (word >> (shift & 7)) | (word << (-shift & 7)); } /** @@ -238,7 +238,7 @@ static inline uint8_t ror8(uint8_t word, unsigned int shift) */ static inline uint16_t rol16(uint16_t word, unsigned int shift) { - return (word << shift) | (word >> ((16 - shift) & 15)); + return (word << (shift & 15)) | (word >> (-shift & 15)); } /** @@ -248,7 +248,7 @@ static inline uint16_t rol16(uint16_t word, unsigned int shift) */ static inline uint16_t ror16(uint16_t word, unsigned int shift) { - return (word >> shift) | (word << ((16 - shift) & 15)); + return (word >> (shift & 15)) | (word << (-shift & 15)); } /** @@ -258,7 +258,7 @@ static inline uint16_t ror16(uint16_t word, unsigned int shift) */ static inline uint32_t rol32(uint32_t word, unsigned int shift) { - return (word << shift) | (word >> ((32 - shift) & 31)); + return (word << (shift & 31)) | (word >> (-shift & 31)); } /** @@ -268,7 +268,7 @@ static inline uint32_t rol32(uint32_t word, unsigned int shift) */ static inline uint32_t ror32(uint32_t word, unsigned int shift) { - return (word >> shift) | (word << ((32 - shift) & 31)); + return (word >> (shift & 31)) | (word << (-shift & 31)); } /** @@ -278,7 +278,7 @@ static inline uint32_t ror32(uint32_t word, unsigned int shift) */ static inline uint64_t rol64(uint64_t word, unsigned int shift) { - return (word << shift) | (word >> ((64 - shift) & 63)); + return (word << (shift & 63)) | (word >> (-shift & 63)); } /** @@ -288,7 +288,7 @@ static inline uint64_t rol64(uint64_t word, unsigned int shift) */ static inline uint64_t ror64(uint64_t word, unsigned int shift) { - return (word >> shift) | (word << ((64 - shift) & 63)); + return (word >> (shift & 63)) | (word << (-shift & 63)); } /** From 31fe256d32d70c83232c68c6e2d136b533e1dda0 Mon Sep 17 00:00:00 2001 From: Kiran Ostrolenk Date: Fri, 28 Apr 2023 15:47:48 +0100 Subject: [PATCH 04/12] qemu/host-utils.h: Add clz and ctz functions for lower-bit integers This is for use in the RISC-V vclz and vctz instructions (implemented in proceeding commit). Signed-off-by: Kiran Ostrolenk Reviewed-by: Richard Henderson Message-Id: <20230428144757.57530-11-lawrence.hunter@codethink.co.uk> Signed-off-by: Richard Henderson --- include/qemu/host-utils.h | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h index 3ce62bf4a5..d3b4dce6a9 100644 --- a/include/qemu/host-utils.h +++ b/include/qemu/host-utils.h @@ -107,6 +107,36 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) } #endif +/** + * clz8 - count leading zeros in a 8-bit value. + * @val: The value to search + * + * Returns 8 if the value is zero. Note that the GCC builtin is + * undefined if the value is zero. + * + * Note that the GCC builtin will upcast its argument to an `unsigned int` + * so this function subtracts off the number of prepended zeroes. + */ +static inline int clz8(uint8_t val) +{ + return val ? __builtin_clz(val) - 24 : 8; +} + +/** + * clz16 - count leading zeros in a 16-bit value. + * @val: The value to search + * + * Returns 16 if the value is zero. Note that the GCC builtin is + * undefined if the value is zero. + * + * Note that the GCC builtin will upcast its argument to an `unsigned int` + * so this function subtracts off the number of prepended zeroes. + */ +static inline int clz16(uint16_t val) +{ + return val ? __builtin_clz(val) - 16 : 16; +} + /** * clz32 - count leading zeros in a 32-bit value. * @val: The value to search @@ -153,6 +183,30 @@ static inline int clo64(uint64_t val) return clz64(~val); } +/** + * ctz8 - count trailing zeros in a 8-bit value. + * @val: The value to search + * + * Returns 8 if the value is zero. Note that the GCC builtin is + * undefined if the value is zero. + */ +static inline int ctz8(uint8_t val) +{ + return val ? __builtin_ctz(val) : 8; +} + +/** + * ctz16 - count trailing zeros in a 16-bit value. + * @val: The value to search + * + * Returns 16 if the value is zero. Note that the GCC builtin is + * undefined if the value is zero. + */ +static inline int ctz16(uint16_t val) +{ + return val ? __builtin_ctz(val) : 16; +} + /** * ctz32 - count trailing zeros in a 32-bit value. * @val: The value to search From 4221aa4a882403fc7d6d22ad6af9a58c8a5badf6 Mon Sep 17 00:00:00 2001 From: Nazar Kazakov Date: Fri, 28 Apr 2023 15:47:47 +0100 Subject: [PATCH 05/12] tcg: Add tcg_gen_gvec_andcs Add tcg expander and helper functions for and-compliment vector with scalar operand. Signed-off-by: Nazar Kazakov Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk> [rth: Split out of larger patch.] Signed-off-by: Richard Henderson --- accel/tcg/tcg-runtime-gvec.c | 11 +++++++++++ accel/tcg/tcg-runtime.h | 1 + include/tcg/tcg-op-gvec.h | 2 ++ tcg/tcg-op-gvec.c | 17 +++++++++++++++++ 4 files changed, 31 insertions(+) diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c index ac7d28c251..97399493d5 100644 --- a/accel/tcg/tcg-runtime-gvec.c +++ b/accel/tcg/tcg-runtime-gvec.c @@ -550,6 +550,17 @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc) clear_high(d, oprsz, desc); } +void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b; + } + clear_high(d, oprsz, desc); +} + void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc) { intptr_t oprsz = simd_oprsz(desc); diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h index e141a6ab24..b8e6421c8a 100644 --- a/accel/tcg/tcg-runtime.h +++ b/accel/tcg/tcg-runtime.h @@ -217,6 +217,7 @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h index 28cafbcc5c..6d58683171 100644 --- a/include/tcg/tcg-op-gvec.h +++ b/include/tcg/tcg-op-gvec.h @@ -330,6 +330,8 @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index 047a832f44..9c14908a46 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -2761,6 +2761,23 @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); } +void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static GVecGen2s g = { + .fni8 = tcg_gen_andc_i64, + .fniv = tcg_gen_andc_vec, + .fno = gen_helper_gvec_andcs, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 + }; + + TCGv_i64 tmp = tcg_temp_ebb_new_i64(); + tcg_gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g); + tcg_temp_free_i64(tmp); +} + static const GVecGen2s gop_xors = { .fni8 = tcg_gen_xor_i64, .fniv = tcg_gen_xor_vec, From bef317d0c36d66cb44d14a0838d5abe7b26b0344 Mon Sep 17 00:00:00 2001 From: Nazar Kazakov Date: Mon, 1 May 2023 21:17:22 +0100 Subject: [PATCH 06/12] tcg: Add tcg_gen_gvec_rotrs Add tcg expander and helper functions for rotate right vector with scalar operand. Signed-off-by: Nazar Kazakov Message-Id: <20230428144757.57530-10-lawrence.hunter@codethink.co.uk> [rth: Split out of larger patch; mask rotation count.] Signed-off-by: Richard Henderson --- include/tcg/tcg-op-gvec.h | 2 ++ tcg/tcg-op-gvec.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h index 6d58683171..a8183bfeab 100644 --- a/include/tcg/tcg-op-gvec.h +++ b/include/tcg/tcg-op-gvec.h @@ -371,6 +371,8 @@ void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz); void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz); /* * Perform vector shift by vector element, modulo the element size. diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index 9c14908a46..f51bcaa87b 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -3353,6 +3353,17 @@ void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); } +void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i32 tmp = tcg_temp_ebb_new_i32(); + + tcg_gen_neg_i32(tmp, shift); + tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1); + tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz); + tcg_temp_free_i32(tmp); +} + /* * Expand D = A << (B % element bits) * From c4075353854ed320cf9225ebd5605836ae236640 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 1 May 2023 21:46:25 +0100 Subject: [PATCH 07/12] qemu/int128: Re-shuffle Int128Alias members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang 14, with --enable-tcg-interpreter errors with include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits) does not match the alignment of the first field in transparent union; transparent_union attribute ignored [-Werror,-Wignored-attributes] __int128_t i; ^ include/qemu/int128.h:486:12: note: alignment of first field is 64 bits Int128 s; ^ 1 error generated. By placing the __uint128_t member first, this is avoided. Signed-off-by: Richard Henderson Reviewed-by: Alex Bennée Message-Id: <20230501204625.277361-1-richard.henderson@linaro.org> --- include/qemu/int128.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/qemu/int128.h b/include/qemu/int128.h index f62a46b48c..9e46cfaefc 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -483,9 +483,9 @@ static inline void bswap128s(Int128 *s) */ #ifdef CONFIG_INT128 typedef union { - Int128 s; - __int128_t i; __uint128_t u; + __int128_t i; + Int128 s; } Int128Alias __attribute__((transparent_union)); #else typedef Int128 Int128Alias; From dc165fcd4effb9e005a4514ab7d666322648e971 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 1 May 2023 22:05:55 +0100 Subject: [PATCH 08/12] migration/xbzrle: Use __attribute__((target)) for avx512 Use the attribute, which is supported by clang, instead of the #pragma, which is not supported and, for some reason, also not detected by the meson probe, so we fail by -Werror. Signed-off-by: Richard Henderson Reviewed-by: Juan Quintela Message-Id: <20230501210555.289806-1-richard.henderson@linaro.org> --- meson.build | 5 +---- migration/xbzrle.c | 9 ++++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/meson.build b/meson.build index c44d05a13f..77d42898c8 100644 --- a/meson.build +++ b/meson.build @@ -2370,12 +2370,9 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ .require(cc.links(''' - #pragma GCC push_options - #pragma GCC target("avx512bw") #include #include - static int bar(void *a) { - + static int __attribute__((target("avx512bw"))) bar(void *a) { __m512i *x = a; __m512i res= _mm512_abs_epi8(*x); return res[1]; diff --git a/migration/xbzrle.c b/migration/xbzrle.c index c6f8b20917..258e4959c9 100644 --- a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -177,11 +177,11 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) } #if defined(CONFIG_AVX512BW_OPT) -#pragma GCC push_options -#pragma GCC target("avx512bw") #include -int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, - uint8_t *dst, int dlen) + +int __attribute__((target("avx512bw"))) +xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen) { uint32_t zrun_len = 0, nzrun_len = 0; int d = 0, i = 0, num = 0; @@ -296,5 +296,4 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, } return d; } -#pragma GCC pop_options #endif From 2899062614ab68ffcd034909b5ea993d8403d6d6 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 12 Apr 2023 13:43:16 +0200 Subject: [PATCH 09/12] accel/tcg: Add cpu_ld*_code_mmu At least RISC-V has the need to be able to perform a read using execute permissions, outside of translation. Add helpers to facilitate this. Signed-off-by: Richard Henderson Acked-by: Alistair Francis Reviewed-by: Weiwei Li Tested-by: Daniel Henrique Barboza Message-Id: <20230325105429.1142530-9-richard.henderson@linaro.org> Message-Id: <20230412114333.118895-9-richard.henderson@linaro.org> --- accel/tcg/cputlb.c | 48 ++++++++++++++++++++++++++++++++++ accel/tcg/user-exec.c | 58 +++++++++++++++++++++++++++++++++++++++++ include/exec/cpu_ldst.h | 9 +++++++ 3 files changed, 115 insertions(+) diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index efa0cb67c9..c8bd642d0e 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -2773,3 +2773,51 @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr) MemOpIdx oi = make_memop_idx(MO_TEUQ, cpu_mmu_index(env, true)); return full_ldq_code(env, addr, oi, 0); } + +uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t retaddr) +{ + return full_ldub_code(env, addr, oi, retaddr); +} + +uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t retaddr) +{ + MemOp mop = get_memop(oi); + int idx = get_mmuidx(oi); + uint16_t ret; + + ret = full_lduw_code(env, addr, make_memop_idx(MO_TEUW, idx), retaddr); + if ((mop & MO_BSWAP) != MO_TE) { + ret = bswap16(ret); + } + return ret; +} + +uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t retaddr) +{ + MemOp mop = get_memop(oi); + int idx = get_mmuidx(oi); + uint32_t ret; + + ret = full_ldl_code(env, addr, make_memop_idx(MO_TEUL, idx), retaddr); + if ((mop & MO_BSWAP) != MO_TE) { + ret = bswap32(ret); + } + return ret; +} + +uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t retaddr) +{ + MemOp mop = get_memop(oi); + int idx = get_mmuidx(oi); + uint64_t ret; + + ret = full_ldq_code(env, addr, make_memop_idx(MO_TEUQ, idx), retaddr); + if ((mop & MO_BSWAP) != MO_TE) { + ret = bswap64(ret); + } + return ret; +} diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index a7e0c3e2f4..fc597a010d 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -1219,6 +1219,64 @@ uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr) return ret; } +uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + uint8_t ret; + + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH); + ret = ldub_p(haddr); + clear_helper_retaddr(); + return ret; +} + +uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + uint16_t ret; + + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH); + ret = lduw_p(haddr); + clear_helper_retaddr(); + if (get_memop(oi) & MO_BSWAP) { + ret = bswap16(ret); + } + return ret; +} + +uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + uint32_t ret; + + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_INST_FETCH); + ret = ldl_p(haddr); + clear_helper_retaddr(); + if (get_memop(oi) & MO_BSWAP) { + ret = bswap32(ret); + } + return ret; +} + +uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + uint64_t ret; + + validate_memop(oi, MO_BEUQ); + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); + ret = ldq_p(haddr); + clear_helper_retaddr(); + if (get_memop(oi) & MO_BSWAP) { + ret = bswap64(ret); + } + return ret; +} + #include "ldst_common.c.inc" /* diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h index 09b55cc0ee..c141f0394f 100644 --- a/include/exec/cpu_ldst.h +++ b/include/exec/cpu_ldst.h @@ -445,6 +445,15 @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx, # define cpu_stq_mmu cpu_stq_le_mmu #endif +uint8_t cpu_ldb_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra); +uint16_t cpu_ldw_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra); +uint32_t cpu_ldl_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra); +uint64_t cpu_ldq_code_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra); + uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr); uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr); uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr); From 341ac0a05509de103f26e8819b5fecea887f5413 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 5 Apr 2023 18:37:25 -0700 Subject: [PATCH 10/12] tcg/loongarch64: Conditionalize tcg_out_exts_i32_i64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since TCG_TYPE_I32 values are kept sign-extended in registers, via ".w" instructions, we need not extend if the register matches. This is already relied upon by comparisons. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/loongarch64/tcg-target.c.inc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index 21c2fc9e98..0940788c6f 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -463,7 +463,9 @@ static void tcg_out_ext32s(TCGContext *s, TCGReg ret, TCGReg arg) static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg) { - tcg_out_ext32s(s, ret, arg); + if (ret != arg) { + tcg_out_ext32s(s, ret, arg); + } } static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg) From c6a98619f7179a3564008ea9c9b3672821a6812f Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 5 Apr 2023 18:40:44 -0700 Subject: [PATCH 11/12] tcg/mips: Conditionalize tcg_out_exts_i32_i64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since TCG_TYPE_I32 values are kept sign-extended in registers, we need not extend if the register matches. This is already relied upon by comparisons. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/mips/tcg-target.c.inc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc index 346c614354..a83ebe8729 100644 --- a/tcg/mips/tcg-target.c.inc +++ b/tcg/mips/tcg-target.c.inc @@ -582,7 +582,9 @@ static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rs) static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs) { - tcg_out_ext32s(s, rd, rs); + if (rd != rs) { + tcg_out_ext32s(s, rd, rs); + } } static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rs) From 129f1f9ee7df77d367d961b3c25353612d33cd83 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 5 Apr 2023 22:27:03 -0700 Subject: [PATCH 12/12] tcg: Introduce tcg_out_movext2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is common code in most qemu_{ld,st} slow paths, moving two registers when there may be overlap between sources and destinations. At present, this is only used by 32-bit hosts for 64-bit data, but will shortly be used for more than that. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 44 ++++++++++--------------- tcg/i386/tcg-target.c.inc | 19 +++++------ tcg/tcg.c | 69 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 90 insertions(+), 42 deletions(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 8d769ca0a2..83c818a58b 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -1545,7 +1545,7 @@ static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi, static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) { - TCGReg argreg, datalo, datahi; + TCGReg argreg; MemOpIdx oi = lb->oi; MemOp opc = get_memop(oi); @@ -1565,22 +1565,16 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) /* Use the canonical unsigned helpers and minimize icache usage. */ tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]); - datalo = lb->datalo_reg; - datahi = lb->datahi_reg; if ((opc & MO_SIZE) == MO_64) { - if (datalo != TCG_REG_R1) { - tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0); - tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1); - } else if (datahi != TCG_REG_R0) { - tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1); - tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0); - } else { - tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0); - tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1); - tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP); - } + TCGMovExtend ext[2] = { + { .dst = lb->datalo_reg, .dst_type = TCG_TYPE_I32, + .src = TCG_REG_R0, .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, + { .dst = lb->datahi_reg, .dst_type = TCG_TYPE_I32, + .src = TCG_REG_R1, .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, + }; + tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP); } else { - tcg_out_movext(s, TCG_TYPE_I32, datalo, + tcg_out_movext(s, TCG_TYPE_I32, lb->datalo_reg, TCG_TYPE_I32, opc & MO_SSIZE, TCG_REG_R0); } @@ -1663,17 +1657,15 @@ static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l) if (TARGET_LONG_BITS == 64) { /* 64-bit target address is aligned into R2:R3. */ - if (l->addrhi_reg != TCG_REG_R2) { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg); - } else if (l->addrlo_reg != TCG_REG_R3) { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, l->addrhi_reg); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, l->addrlo_reg); - } else { - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, TCG_REG_R2); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R2, TCG_REG_R3); - tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R3, TCG_REG_R1); - } + TCGMovExtend ext[2] = { + { .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32, + .src = l->addrlo_reg, + .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, + { .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32, + .src = l->addrhi_reg, + .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, + }; + tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP); } else { tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg); } diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index c8e2bf537f..caf91a3151 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -1914,7 +1914,6 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) { MemOpIdx oi = l->oi; MemOp opc = get_memop(oi); - TCGReg data_reg; tcg_insn_unit **label_ptr = &l->label_ptr[0]; /* resolve label address */ @@ -1951,18 +1950,16 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]); - data_reg = l->datalo_reg; if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) { - if (data_reg == TCG_REG_EDX) { - /* xchg %edx, %eax */ - tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); - tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX); - } else { - tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); - tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX); - } + TCGMovExtend ext[2] = { + { .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32, + .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, + { .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32, + .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL }, + }; + tcg_out_movext2(s, &ext[0], &ext[1], -1); } else { - tcg_out_movext(s, l->type, data_reg, + tcg_out_movext(s, l->type, l->datalo_reg, TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX); } diff --git a/tcg/tcg.c b/tcg/tcg.c index fde5ccc57c..cfd3262a4a 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -115,8 +115,7 @@ static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg); static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg ret, TCGReg arg); static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg ret, TCGReg arg); static void tcg_out_addi_ptr(TCGContext *s, TCGReg, TCGReg, tcg_target_long); -static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2) - __attribute__((unused)); +static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2); static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg); static void tcg_out_goto_tb(TCGContext *s, int which); static void tcg_out_op(TCGContext *s, TCGOpcode opc, @@ -354,6 +353,14 @@ void tcg_raise_tb_overflow(TCGContext *s) siglongjmp(s->jmp_trans, -2); } +typedef struct TCGMovExtend { + TCGReg dst; + TCGReg src; + TCGType dst_type; + TCGType src_type; + MemOp src_ext; +} TCGMovExtend; + /** * tcg_out_movext -- move and extend * @s: tcg context @@ -365,9 +372,8 @@ void tcg_raise_tb_overflow(TCGContext *s) * * Move or extend @src into @dst, depending on @src_ext and the types. */ -static void __attribute__((unused)) -tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst, - TCGType src_type, MemOp src_ext, TCGReg src) +static void tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst, + TCGType src_type, MemOp src_ext, TCGReg src) { switch (src_ext) { case MO_UB: @@ -417,6 +423,59 @@ tcg_out_movext(TCGContext *s, TCGType dst_type, TCGReg dst, } } +/* Minor variations on a theme, using a structure. */ +static void tcg_out_movext1_new_src(TCGContext *s, const TCGMovExtend *i, + TCGReg src) +{ + tcg_out_movext(s, i->dst_type, i->dst, i->src_type, i->src_ext, src); +} + +static void tcg_out_movext1(TCGContext *s, const TCGMovExtend *i) +{ + tcg_out_movext1_new_src(s, i, i->src); +} + +/** + * tcg_out_movext2 -- move and extend two pair + * @s: tcg context + * @i1: first move description + * @i2: second move description + * @scratch: temporary register, or -1 for none + * + * As tcg_out_movext, for both @i1 and @i2, caring for overlap + * between the sources and destinations. + */ + +static void __attribute__((unused)) +tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1, + const TCGMovExtend *i2, int scratch) +{ + TCGReg src1 = i1->src; + TCGReg src2 = i2->src; + + if (i1->dst != src2) { + tcg_out_movext1(s, i1); + tcg_out_movext1(s, i2); + return; + } + if (i2->dst == src1) { + TCGType src1_type = i1->src_type; + TCGType src2_type = i2->src_type; + + if (tcg_out_xchg(s, MAX(src1_type, src2_type), src1, src2)) { + /* The data is now in the correct registers, now extend. */ + src1 = i2->src; + src2 = i1->src; + } else { + tcg_debug_assert(scratch >= 0); + tcg_out_mov(s, src1_type, scratch, src1); + src1 = scratch; + } + } + tcg_out_movext1_new_src(s, i2, src2); + tcg_out_movext1_new_src(s, i1, src1); +} + #define C_PFX1(P, A) P##A #define C_PFX2(P, A, B) P##A##_##B #define C_PFX3(P, A, B, C) P##A##_##B##_##C