From 4d5738222ff0a7f4d56fd4a2971b0605726a8fb2 Mon Sep 17 00:00:00 2001 From: Matheus Kowalczuk Ferst Date: Mon, 13 Jun 2022 14:43:59 +0000 Subject: [PATCH 1/9] tcg/ppc: implement rem[u]_i{32,64} with mod[su][wd] Power ISA v3.0 introduced mod[su][wd] insns that can be used to implement rem[u]_i{32,64}. Signed-off-by: Matheus Ferst Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 22 ++++++++++++++++++++++ tcg/ppc/tcg-target.h | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index de4483e43b..1cbd047ab3 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -371,6 +371,8 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define MULHWU XO31( 11) #define DIVW XO31(491) #define DIVWU XO31(459) +#define MODSW XO31(779) +#define MODUW XO31(267) #define CMP XO31( 0) #define CMPL XO31( 32) #define LHBRX XO31(790) @@ -403,6 +405,8 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define MULHDU XO31( 9) #define DIVD XO31(489) #define DIVDU XO31(457) +#define MODSD XO31(777) +#define MODUD XO31(265) #define LBZX XO31( 87) #define LHZX XO31(279) @@ -2806,6 +2810,14 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out32(s, DIVWU | TAB(args[0], args[1], args[2])); break; + case INDEX_op_rem_i32: + tcg_out32(s, MODSW | TAB(args[0], args[1], args[2])); + break; + + case INDEX_op_remu_i32: + tcg_out32(s, MODUW | TAB(args[0], args[1], args[2])); + break; + case INDEX_op_shl_i32: if (const_args[2]) { /* Limit immediate shift count lest we create an illegal insn. */ @@ -2947,6 +2959,12 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_divu_i64: tcg_out32(s, DIVDU | TAB(args[0], args[1], args[2])); break; + case INDEX_op_rem_i64: + tcg_out32(s, MODSD | TAB(args[0], args[1], args[2])); + break; + case INDEX_op_remu_i64: + tcg_out32(s, MODUD | TAB(args[0], args[1], args[2])); + break; case INDEX_op_qemu_ld_i32: tcg_out_qemu_ld(s, args, false); @@ -3722,6 +3740,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_div_i32: case INDEX_op_divu_i32: + case INDEX_op_rem_i32: + case INDEX_op_remu_i32: case INDEX_op_nand_i32: case INDEX_op_nor_i32: case INDEX_op_muluh_i32: @@ -3732,6 +3752,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_nor_i64: case INDEX_op_div_i64: case INDEX_op_divu_i64: + case INDEX_op_rem_i64: + case INDEX_op_remu_i64: case INDEX_op_mulsh_i64: case INDEX_op_muluh_i64: return C_O1_I2(r, r, r); diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h index e6cf72503f..b5cd225cfa 100644 --- a/tcg/ppc/tcg-target.h +++ b/tcg/ppc/tcg-target.h @@ -83,7 +83,7 @@ extern bool have_vsx; /* optional instructions */ #define TCG_TARGET_HAS_div_i32 1 -#define TCG_TARGET_HAS_rem_i32 0 +#define TCG_TARGET_HAS_rem_i32 have_isa_3_00 #define TCG_TARGET_HAS_rot_i32 1 #define TCG_TARGET_HAS_ext8s_i32 1 #define TCG_TARGET_HAS_ext16s_i32 1 @@ -117,7 +117,7 @@ extern bool have_vsx; #define TCG_TARGET_HAS_extrl_i64_i32 0 #define TCG_TARGET_HAS_extrh_i64_i32 0 #define TCG_TARGET_HAS_div_i64 1 -#define TCG_TARGET_HAS_rem_i64 0 +#define TCG_TARGET_HAS_rem_i64 have_isa_3_00 #define TCG_TARGET_HAS_rot_i64 1 #define TCG_TARGET_HAS_ext8s_i64 1 #define TCG_TARGET_HAS_ext16s_i64 1 From adb5974dcc5ee7fe122c74fb85d3bae331101ec3 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Tue, 22 Mar 2022 17:50:04 +0800 Subject: [PATCH 2/9] target/avr: Drop avr_cpu_memory_rw_debug() MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPUClass::memory_rw_debug() holds a callback for GDB memory access. If not provided, cpu_memory_rw_debug() is used by the GDB stub. Drop avr_cpu_memory_rw_debug() which does nothing special. Signed-off-by: Bin Meng Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20220322095004.70682-1-bmeng.cn@gmail.com> Signed-off-by: Richard Henderson --- target/avr/cpu.c | 1 - target/avr/cpu.h | 2 -- target/avr/helper.c | 6 ------ 3 files changed, 9 deletions(-) diff --git a/target/avr/cpu.c b/target/avr/cpu.c index 5d70e34dd5..05b992ff73 100644 --- a/target/avr/cpu.c +++ b/target/avr/cpu.c @@ -214,7 +214,6 @@ static void avr_cpu_class_init(ObjectClass *oc, void *data) cc->has_work = avr_cpu_has_work; cc->dump_state = avr_cpu_dump_state; cc->set_pc = avr_cpu_set_pc; - cc->memory_rw_debug = avr_cpu_memory_rw_debug; dc->vmsd = &vms_avr_cpu; cc->sysemu_ops = &avr_sysemu_ops; cc->disas_set_info = avr_cpu_disas_set_info; diff --git a/target/avr/cpu.h b/target/avr/cpu.h index d304f33301..96419c0c2b 100644 --- a/target/avr/cpu.h +++ b/target/avr/cpu.h @@ -184,8 +184,6 @@ void avr_cpu_tcg_init(void); void avr_cpu_list(void); int cpu_avr_exec(CPUState *cpu); -int avr_cpu_memory_rw_debug(CPUState *cs, vaddr address, uint8_t *buf, - int len, bool is_write); enum { TB_FLAGS_FULL_ACCESS = 1, diff --git a/target/avr/helper.c b/target/avr/helper.c index c27f702901..db76452f9a 100644 --- a/target/avr/helper.c +++ b/target/avr/helper.c @@ -93,12 +93,6 @@ void avr_cpu_do_interrupt(CPUState *cs) cs->exception_index = -1; } -int avr_cpu_memory_rw_debug(CPUState *cs, vaddr addr, uint8_t *buf, - int len, bool is_write) -{ - return cpu_memory_rw_debug(cs, addr, buf, len, is_write); -} - hwaddr avr_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) { return addr; /* I assume 1:1 address correspondence */ From a82fd5a4ec24d923ff1e6da128c0fd4a74079d99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 23 Mar 2022 18:17:43 +0100 Subject: [PATCH 3/9] accel/tcg: Init TCG cflags in vCPU thread handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move TCG cflags initialization to thread handler. Remove the duplicated assert checks. 
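For reference, the helper being moved is small: it computes the per-vCPU
compilation flags once, before the vCPU translates its first TB. A rough
sketch of its shape (as found in accel/tcg/tcg-accel-ops.c around this
time; shown for context, not part of this diff):

    void tcg_cpu_init_cflags(CPUState *cpu, bool parallel)
    {
        uint32_t cflags = cpu->cluster_index << CF_CLUSTER_SHIFT;
        cflags |= parallel ? CF_PARALLEL : 0;   /* MTTCG with >1 vCPU */
        cflags |= icount_enabled() ? CF_USE_ICOUNT : 0;
        cpu->tcg_cflags = cflags;
    }

Calling it at the top of the thread function still happens before the
first TB is generated, so the flags are in place in time.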
Signed-off-by: Philippe Mathieu-Daudé
Message-Id: <20220323171751.78612-6-philippe.mathieu.daude@gmail.com>
Signed-off-by: Richard Henderson
---
 accel/tcg/tcg-accel-ops-mttcg.c | 5 ++---
 accel/tcg/tcg-accel-ops-rr.c    | 7 +++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
index d50239e0e2..ba997f6cfe 100644
--- a/accel/tcg/tcg-accel-ops-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -70,6 +70,8 @@ static void *mttcg_cpu_thread_fn(void *arg)
     assert(tcg_enabled());
     g_assert(!icount_enabled());
 
+    tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1);
+
     rcu_register_thread();
     force_rcu.notifier.notify = mttcg_force_rcu;
     force_rcu.cpu = cpu;
@@ -139,9 +141,6 @@ void mttcg_start_vcpu_thread(CPUState *cpu)
 {
     char thread_name[VCPU_THREAD_NAME_SIZE];
 
-    g_assert(tcg_enabled());
-    tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1);
-
     cpu->thread = g_new0(QemuThread, 1);
     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
     qemu_cond_init(cpu->halt_cond);
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index 1a72149f0e..cc8adc2380 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -152,7 +152,9 @@ static void *rr_cpu_thread_fn(void *arg)
     Notifier force_rcu;
     CPUState *cpu = arg;
 
-    assert(tcg_enabled());
+    g_assert(tcg_enabled());
+    tcg_cpu_init_cflags(cpu, false);
+
     rcu_register_thread();
     force_rcu.notify = rr_force_rcu;
     rcu_add_force_rcu_notifier(&force_rcu);
@@ -275,9 +277,6 @@ void rr_start_vcpu_thread(CPUState *cpu)
     static QemuCond *single_tcg_halt_cond;
     static QemuThread *single_tcg_cpu_thread;
 
-    g_assert(tcg_enabled());
-    tcg_cpu_init_cflags(cpu, false);
-
     if (!single_tcg_cpu_thread) {
         cpu->thread = g_new0(QemuThread, 1);
         cpu->halt_cond = g_new0(QemuCond, 1);

From 18b8c47f8e66c1c45a0fb59cfd6ed4dfeb25c6f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?=
Date: Wed, 23 Mar 2022 18:17:44 +0100
Subject: [PATCH 4/9] accel/tcg: Reorganize tcg_accel_ops_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reorganize the TCG AccelOpsClass initialization to emphasize that icount
mode shares more code with single-threaded TCG.
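The resulting selection logic, reduced to a standalone sketch (hypothetical
names, not QEMU code): both round-robin variants share the thread setup and
diverge only in interrupt and clock handling.

    #include <stdbool.h>

    typedef struct {
        void (*create_vcpu_thread)(void);
        void (*handle_interrupt)(void);
        long long (*get_virtual_clock)(void);   /* only set for icount */
    } Ops;

    static void rr_start(void) { }
    static void tcg_irq(void) { }
    static void icount_irq(void) { }
    static long long icount_clock(void) { return 0; }

    static void ops_init(Ops *ops, bool mttcg, bool icount)
    {
        if (!mttcg) {
            ops->create_vcpu_thread = rr_start;  /* shared by both RR modes */
            if (icount) {
                ops->handle_interrupt = icount_irq;
                ops->get_virtual_clock = icount_clock;
            } else {
                ops->handle_interrupt = tcg_irq;
            }
        }
    }

    int main(void)
    {
        Ops ops = {0};
        ops_init(&ops, false, true);   /* round-robin + icount */
        return ops.handle_interrupt == icount_irq ? 0 : 1;
    }

(icount is not supported under MTTCG, which is why the icount test can live
entirely inside the round-robin branch.)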
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Message-Id: <20220323171751.78612-7-philippe.mathieu.daude@gmail.com> Signed-off-by: Richard Henderson --- accel/tcg/tcg-accel-ops.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c index 684dc5a137..786d90c08f 100644 --- a/accel/tcg/tcg-accel-ops.c +++ b/accel/tcg/tcg-accel-ops.c @@ -97,16 +97,17 @@ static void tcg_accel_ops_init(AccelOpsClass *ops) ops->create_vcpu_thread = mttcg_start_vcpu_thread; ops->kick_vcpu_thread = mttcg_kick_vcpu_thread; ops->handle_interrupt = tcg_handle_interrupt; - } else if (icount_enabled()) { - ops->create_vcpu_thread = rr_start_vcpu_thread; - ops->kick_vcpu_thread = rr_kick_vcpu_thread; - ops->handle_interrupt = icount_handle_interrupt; - ops->get_virtual_clock = icount_get; - ops->get_elapsed_ticks = icount_get; } else { ops->create_vcpu_thread = rr_start_vcpu_thread; ops->kick_vcpu_thread = rr_kick_vcpu_thread; - ops->handle_interrupt = tcg_handle_interrupt; + + if (icount_enabled()) { + ops->handle_interrupt = icount_handle_interrupt; + ops->get_virtual_clock = icount_get; + ops->get_elapsed_ticks = icount_get; + } else { + ops->handle_interrupt = tcg_handle_interrupt; + } } } From 3f42906c9ab2c777a895b48b87b8107167e4a275 Mon Sep 17 00:00:00 2001 From: Idan Horowitz Date: Fri, 14 Jan 2022 02:43:58 +0200 Subject: [PATCH 5/9] qemu-timer: Skip empty timer lists before locking in qemu_clock_deadline_ns_all This decreases qemu_clock_deadline_ns_all's share from 23.2% to 13% in a profile of icount-enabled aarch64-softmmu. Signed-off-by: Idan Horowitz Reviewed-by: Richard Henderson Message-Id: <20220114004358.299534-2-idan.horowitz@gmail.com> Signed-off-by: Richard Henderson --- util/qemu-timer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/qemu-timer.c b/util/qemu-timer.c index a670a57881..6a0de33dd2 100644 --- a/util/qemu-timer.c +++ b/util/qemu-timer.c @@ -261,6 +261,9 @@ int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask) } QLIST_FOREACH(timer_list, &clock->timerlists, list) { + if (!qatomic_read(&timer_list->active_timers)) { + continue; + } qemu_mutex_lock(&timer_list->active_timers_lock); ts = timer_list->active_timers; /* Skip all external timers */ From 418ade7849ce7641c0f7333718caf5091a02fd4c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 21 Jun 2022 08:38:29 -0700 Subject: [PATCH 6/9] softmmu: Always initialize xlat in address_space_translate_for_iotlb The bug is an uninitialized memory read, along the translate_fail path, which results in garbage being read from iotlb_to_section, which can lead to a crash in io_readx/io_writex. The bug may be fixed by writing any value with zero in ~TARGET_PAGE_MASK, so that the call to iotlb_to_section using the xlat'ed address returns io_mem_unassigned, as desired by the translate_fail path. It is most useful to record the original physical page address, which will eventually be logged by memory_region_access_valid when the access is rejected by unassigned_mem_accepts. 
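To make the new assertion concrete, a standalone sketch of the mask
arithmetic, using 4 KiB pages as an example (TARGET_PAGE_BITS is
per-target):

    #include <assert.h>
    #include <stdint.h>

    #define TARGET_PAGE_BITS 12                         /* example value */
    #define TARGET_PAGE_SIZE (1ULL << TARGET_PAGE_BITS)
    #define TARGET_PAGE_MASK (~(TARGET_PAGE_SIZE - 1))  /* 0x...fffff000 */

    int main(void)
    {
        uint64_t orig_addr = 0x4000a000;   /* page-aligned */
        /* ~TARGET_PAGE_MASK == 0xfff: just the page-offset bits. */
        assert((orig_addr & ~TARGET_PAGE_MASK) == 0);
        return 0;
    }

A page-aligned xlat has a zero page offset, so it indexes sections[] at
entry 0, which is PHYS_SECTION_UNASSIGNED: exactly what the translate_fail
path wants.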
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1065 Signed-off-by: Richard Henderson Reviewed-by: Peter Maydell Message-Id: <20220621153829.366423-1-richard.henderson@linaro.org> --- softmmu/physmem.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/softmmu/physmem.c b/softmmu/physmem.c index fb16be57a6..dc3c3e5f2e 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -669,7 +669,7 @@ void tcg_iommu_init_notifier_list(CPUState *cpu) /* Called from RCU critical section */ MemoryRegionSection * -address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, +address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr, hwaddr *xlat, hwaddr *plen, MemTxAttrs attrs, int *prot) { @@ -678,6 +678,7 @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, IOMMUMemoryRegionClass *imrc; IOMMUTLBEntry iotlb; int iommu_idx; + hwaddr addr = orig_addr; AddressSpaceDispatch *d = qatomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch); @@ -722,6 +723,16 @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, return section; translate_fail: + /* + * We should be given a page-aligned address -- certainly + * tlb_set_page_with_attrs() does so. The page offset of xlat + * is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0. + * The page portion of xlat will be logged by memory_region_access_valid() + * when this memory access is rejected, so use the original untranslated + * physical address. + */ + assert((orig_addr & ~TARGET_PAGE_MASK) == 0); + *xlat = orig_addr; return &d->map.sections[PHYS_SECTION_UNASSIGNED]; } From 79713752870c7d730d128b9158edb0c58b82fcf9 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 20 Jun 2022 18:48:35 -0700 Subject: [PATCH 7/9] util: Merge cacheflush.c and cacheinfo.c Combine the two files into cacheflush.c. There's a couple of bits that would be helpful to share between the two, and combining them seems better than exporting the bits. Signed-off-by: Richard Henderson Reviewed-by: Peter Maydell Message-Id: <20220621014837.189139-2-richard.henderson@linaro.org> --- util/cacheflush.c | 202 +++++++++++++++++++++++++++++++++++++++++++++- util/cacheinfo.c | 200 --------------------------------------------- util/meson.build | 2 +- 3 files changed, 202 insertions(+), 202 deletions(-) delete mode 100644 util/cacheinfo.c diff --git a/util/cacheflush.c b/util/cacheflush.c index 4b57186d89..8096afd33c 100644 --- a/util/cacheflush.c +++ b/util/cacheflush.c @@ -1,5 +1,5 @@ /* - * Flush the host cpu caches. + * Info about, and flushing the host cpu caches. * * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. @@ -9,8 +9,208 @@ #include "qemu/cacheflush.h" #include "qemu/cacheinfo.h" #include "qemu/bitops.h" +#include "qemu/host-utils.h" +#include "qemu/atomic.h" +int qemu_icache_linesize = 0; +int qemu_icache_linesize_log; +int qemu_dcache_linesize = 0; +int qemu_dcache_linesize_log; + +/* + * Operating system specific cache detection mechanisms. + */ + +#if defined(_WIN32) + +static void sys_cache_info(int *isize, int *dsize) +{ + SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf; + DWORD size = 0; + BOOL success; + size_t i, n; + + /* + * Check for the required buffer size first. Note that if the zero + * size we use for the probe results in success, then there is no + * data available; fail in that case. 
+ */ + success = GetLogicalProcessorInformation(0, &size); + if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return; + } + + n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n); + if (!GetLogicalProcessorInformation(buf, &size)) { + goto fail; + } + + for (i = 0; i < n; i++) { + if (buf[i].Relationship == RelationCache + && buf[i].Cache.Level == 1) { + switch (buf[i].Cache.Type) { + case CacheUnified: + *isize = *dsize = buf[i].Cache.LineSize; + break; + case CacheInstruction: + *isize = buf[i].Cache.LineSize; + break; + case CacheData: + *dsize = buf[i].Cache.LineSize; + break; + default: + break; + } + } + } + fail: + g_free(buf); +} + +#elif defined(__APPLE__) +# include +static void sys_cache_info(int *isize, int *dsize) +{ + /* There's only a single sysctl for both I/D cache line sizes. */ + long size; + size_t len = sizeof(size); + if (!sysctlbyname("hw.cachelinesize", &size, &len, NULL, 0)) { + *isize = *dsize = size; + } +} +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +# include +static void sys_cache_info(int *isize, int *dsize) +{ + /* There's only a single sysctl for both I/D cache line sizes. */ + int size; + size_t len = sizeof(size); + if (!sysctlbyname("machdep.cacheline_size", &size, &len, NULL, 0)) { + *isize = *dsize = size; + } +} +#else +/* POSIX */ + +static void sys_cache_info(int *isize, int *dsize) +{ +# ifdef _SC_LEVEL1_ICACHE_LINESIZE + int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE); + if (tmp_isize > 0) { + *isize = tmp_isize; + } +# endif +# ifdef _SC_LEVEL1_DCACHE_LINESIZE + int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (tmp_dsize > 0) { + *dsize = tmp_dsize; + } +# endif +} +#endif /* sys_cache_info */ + + +/* + * Architecture (+ OS) specific cache detection mechanisms. + */ + +#if defined(__aarch64__) + +static void arch_cache_info(int *isize, int *dsize) +{ + if (*isize == 0 || *dsize == 0) { + uint64_t ctr; + + /* + * The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1, + * but (at least under Linux) these are marked protected by the + * kernel. However, CTR_EL0 contains the minimum linesize in the + * entire hierarchy, and is used by userspace cache flushing. + */ + asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr)); + if (*isize == 0) { + *isize = 4 << (ctr & 0xf); + } + if (*dsize == 0) { + *dsize = 4 << ((ctr >> 16) & 0xf); + } + } +} + +#elif defined(_ARCH_PPC) && defined(__linux__) +# include "elf.h" + +static void arch_cache_info(int *isize, int *dsize) +{ + if (*isize == 0) { + *isize = qemu_getauxval(AT_ICACHEBSIZE); + } + if (*dsize == 0) { + *dsize = qemu_getauxval(AT_DCACHEBSIZE); + } +} + +#else +static void arch_cache_info(int *isize, int *dsize) { } +#endif /* arch_cache_info */ + +/* + * ... and if all else fails ... + */ + +static void fallback_cache_info(int *isize, int *dsize) +{ + /* If we can only find one of the two, assume they're the same. */ + if (*isize) { + if (*dsize) { + /* Success! */ + } else { + *dsize = *isize; + } + } else if (*dsize) { + *isize = *dsize; + } else { +#if defined(_ARCH_PPC) + /* + * For PPC, we're going to use the cache sizes computed for + * flush_idcache_range. Which means that we must use the + * architecture minimum. + */ + *isize = *dsize = 16; +#else + /* Otherwise, 64 bytes is not uncommon. 
*/ + *isize = *dsize = 64; +#endif + } +} + +static void __attribute__((constructor)) init_cache_info(void) +{ + int isize = 0, dsize = 0; + + sys_cache_info(&isize, &dsize); + arch_cache_info(&isize, &dsize); + fallback_cache_info(&isize, &dsize); + + assert((isize & (isize - 1)) == 0); + assert((dsize & (dsize - 1)) == 0); + + qemu_icache_linesize = isize; + qemu_icache_linesize_log = ctz32(isize); + qemu_dcache_linesize = dsize; + qemu_dcache_linesize_log = ctz32(dsize); + + qatomic64_init(); +} + + +/* + * Architecture (+ OS) specific cache flushing mechanisms. + */ + #if defined(__i386__) || defined(__x86_64__) || defined(__s390__) /* Caches are coherent and do not require flushing; symbol inline. */ diff --git a/util/cacheinfo.c b/util/cacheinfo.c deleted file mode 100644 index ab1644d490..0000000000 --- a/util/cacheinfo.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * cacheinfo.c - helpers to query the host about its caches - * - * Copyright (C) 2017, Emilio G. Cota - * License: GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ - -#include "qemu/osdep.h" -#include "qemu/host-utils.h" -#include "qemu/atomic.h" -#include "qemu/cacheinfo.h" - -int qemu_icache_linesize = 0; -int qemu_icache_linesize_log; -int qemu_dcache_linesize = 0; -int qemu_dcache_linesize_log; - -/* - * Operating system specific detection mechanisms. - */ - -#if defined(_WIN32) - -static void sys_cache_info(int *isize, int *dsize) -{ - SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf; - DWORD size = 0; - BOOL success; - size_t i, n; - - /* Check for the required buffer size first. Note that if the zero - size we use for the probe results in success, then there is no - data available; fail in that case. */ - success = GetLogicalProcessorInformation(0, &size); - if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { - return; - } - - n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); - buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n); - if (!GetLogicalProcessorInformation(buf, &size)) { - goto fail; - } - - for (i = 0; i < n; i++) { - if (buf[i].Relationship == RelationCache - && buf[i].Cache.Level == 1) { - switch (buf[i].Cache.Type) { - case CacheUnified: - *isize = *dsize = buf[i].Cache.LineSize; - break; - case CacheInstruction: - *isize = buf[i].Cache.LineSize; - break; - case CacheData: - *dsize = buf[i].Cache.LineSize; - break; - default: - break; - } - } - } - fail: - g_free(buf); -} - -#elif defined(__APPLE__) -# include -static void sys_cache_info(int *isize, int *dsize) -{ - /* There's only a single sysctl for both I/D cache line sizes. */ - long size; - size_t len = sizeof(size); - if (!sysctlbyname("hw.cachelinesize", &size, &len, NULL, 0)) { - *isize = *dsize = size; - } -} -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -# include -static void sys_cache_info(int *isize, int *dsize) -{ - /* There's only a single sysctl for both I/D cache line sizes. 
*/ - int size; - size_t len = sizeof(size); - if (!sysctlbyname("machdep.cacheline_size", &size, &len, NULL, 0)) { - *isize = *dsize = size; - } -} -#else -/* POSIX */ - -static void sys_cache_info(int *isize, int *dsize) -{ -# ifdef _SC_LEVEL1_ICACHE_LINESIZE - int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE); - if (tmp_isize > 0) { - *isize = tmp_isize; - } -# endif -# ifdef _SC_LEVEL1_DCACHE_LINESIZE - int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - if (tmp_dsize > 0) { - *dsize = tmp_dsize; - } -# endif -} -#endif /* sys_cache_info */ - -/* - * Architecture (+ OS) specific detection mechanisms. - */ - -#if defined(__aarch64__) - -static void arch_cache_info(int *isize, int *dsize) -{ - if (*isize == 0 || *dsize == 0) { - uint64_t ctr; - - /* The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1, - but (at least under Linux) these are marked protected by the - kernel. However, CTR_EL0 contains the minimum linesize in the - entire hierarchy, and is used by userspace cache flushing. */ - asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr)); - if (*isize == 0) { - *isize = 4 << (ctr & 0xf); - } - if (*dsize == 0) { - *dsize = 4 << ((ctr >> 16) & 0xf); - } - } -} - -#elif defined(_ARCH_PPC) && defined(__linux__) -# include "elf.h" - -static void arch_cache_info(int *isize, int *dsize) -{ - if (*isize == 0) { - *isize = qemu_getauxval(AT_ICACHEBSIZE); - } - if (*dsize == 0) { - *dsize = qemu_getauxval(AT_DCACHEBSIZE); - } -} - -#else -static void arch_cache_info(int *isize, int *dsize) { } -#endif /* arch_cache_info */ - -/* - * ... and if all else fails ... - */ - -static void fallback_cache_info(int *isize, int *dsize) -{ - /* If we can only find one of the two, assume they're the same. */ - if (*isize) { - if (*dsize) { - /* Success! */ - } else { - *dsize = *isize; - } - } else if (*dsize) { - *isize = *dsize; - } else { -#if defined(_ARCH_PPC) - /* - * For PPC, we're going to use the cache sizes computed for - * flush_idcache_range. Which means that we must use the - * architecture minimum. - */ - *isize = *dsize = 16; -#else - /* Otherwise, 64 bytes is not uncommon. */ - *isize = *dsize = 64; -#endif - } -} - -static void __attribute__((constructor)) init_cache_info(void) -{ - int isize = 0, dsize = 0; - - sys_cache_info(&isize, &dsize); - arch_cache_info(&isize, &dsize); - fallback_cache_info(&isize, &dsize); - - assert((isize & (isize - 1)) == 0); - assert((dsize & (dsize - 1)) == 0); - - qemu_icache_linesize = isize; - qemu_icache_linesize_log = ctz32(isize); - qemu_dcache_linesize = dsize; - qemu_dcache_linesize_log = ctz32(dsize); - - qatomic64_init(); -} diff --git a/util/meson.build b/util/meson.build index 8f16018cd4..4939b0b91c 100644 --- a/util/meson.build +++ b/util/meson.build @@ -27,7 +27,7 @@ util_ss.add(files('envlist.c', 'path.c', 'module.c')) util_ss.add(files('host-utils.c')) util_ss.add(files('bitmap.c', 'bitops.c')) util_ss.add(files('fifo8.c')) -util_ss.add(files('cacheinfo.c', 'cacheflush.c')) +util_ss.add(files('cacheflush.c')) util_ss.add(files('error.c', 'error-report.c')) util_ss.add(files('qemu-print.c')) util_ss.add(files('id.c')) From bdd50dc7d09c90525b80da4f056b849049893732 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 20 Jun 2022 18:48:36 -0700 Subject: [PATCH 8/9] util/cacheflush: Merge aarch64 ctr_el0 usage Merge init_ctr_el0 into arch_cache_info. In flush_idcache_range, use the pre-computed line sizes from the global variables. Use CONFIG_DARWIN in preference to __APPLE__. 
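The CTR_EL0 fields involved, decoded in a standalone sketch (bit layout per
the Arm ARM; illustration only, not QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    static void decode_ctr_el0(uint64_t ctr)
    {
        /* IminLine [3:0] and DminLine [19:16]: log2 of words per line. */
        unsigned icache_line = 4u << (ctr & 0xf);
        unsigned dcache_line = 4u << ((ctr >> 16) & 0xf);
        /* IDC (bit 28): no d-cache clean needed for I/D coherence.
           DIC (bit 29): no i-cache invalidate needed either. */
        int idc = (ctr >> 28) & 1;
        int dic = (ctr >> 29) & 1;
        printf("I-line %u B, D-line %u B, IDC=%d, DIC=%d\n",
               icache_line, dcache_line, idc, dic);
    }

    int main(void)
    {
        decode_ctr_el0(0x40004);   /* example: 64-byte lines, IDC=DIC=0 */
        return 0;
    }

Saving the whole register, rather than just the two line sizes, keeps the
IDC/DIC bits available to flush_idcache_range(), which is why save_ctr_el0
survives the merge instead of being replaced by qemu_*_linesize alone.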
Signed-off-by: Richard Henderson
Reviewed-by: Peter Maydell
Message-Id: <20220621014837.189139-3-richard.henderson@linaro.org>
---
 util/cacheflush.c | 44 +++++++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/util/cacheflush.c b/util/cacheflush.c
index 8096afd33c..01b6cb7583 100644
--- a/util/cacheflush.c
+++ b/util/cacheflush.c
@@ -70,7 +70,7 @@ static void sys_cache_info(int *isize, int *dsize)
     g_free(buf);
 }
 
-#elif defined(__APPLE__)
+#elif defined(CONFIG_DARWIN)
 # include <sys/sysctl.h>
 static void sys_cache_info(int *isize, int *dsize)
 {
@@ -117,20 +117,25 @@ static void sys_cache_info(int *isize, int *dsize)
  * Architecture (+ OS) specific cache detection mechanisms.
  */
 
-#if defined(__aarch64__)
-
+#if defined(__aarch64__) && !defined(CONFIG_DARWIN)
+/* Apple does not expose CTR_EL0, so we must use system interfaces. */
+static uint64_t save_ctr_el0;
 static void arch_cache_info(int *isize, int *dsize)
 {
-    if (*isize == 0 || *dsize == 0) {
-        uint64_t ctr;
+    uint64_t ctr;
 
-        /*
-         * The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1,
-         * but (at least under Linux) these are marked protected by the
-         * kernel. However, CTR_EL0 contains the minimum linesize in the
-         * entire hierarchy, and is used by userspace cache flushing.
-         */
-        asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
+    /*
+     * The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1,
+     * but (at least under Linux) these are marked protected by the
+     * kernel. However, CTR_EL0 contains the minimum linesize in the
+     * entire hierarchy, and is used by userspace cache flushing.
+     *
+     * We will also use this value in flush_idcache_range.
+     */
+    asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
+    save_ctr_el0 = ctr;
+
+    if (*isize == 0 || *dsize == 0) {
         if (*isize == 0) {
             *isize = 4 << (ctr & 0xf);
         }
         if (*dsize == 0) {
             *dsize = 4 << ((ctr >> 16) & 0xf);
         }
@@ -228,17 +233,6 @@ void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
 }
 #else
-/*
- * TODO: unify this with cacheinfo.c.
- * We want to save the whole contents of CTR_EL0, so that we
- * have more than the linesize, but also IDC and DIC.
- */
-static uint64_t save_ctr_el0;
-static void __attribute__((constructor)) init_ctr_el0(void)
-{
-    asm volatile("mrs\t%0, ctr_el0" : "=r"(save_ctr_el0));
-}
-
 /*
  * This is a copy of gcc's __aarch64_sync_cache_range, modified
  * to fit this three-operand interface.
  */
 void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
 {
     const unsigned CTR_IDC = 1u << 28;
     const unsigned CTR_DIC = 1u << 29;
     const uint64_t ctr_el0 = save_ctr_el0;
-    const uintptr_t icache_lsize = 4 << extract64(ctr_el0, 0, 4);
-    const uintptr_t dcache_lsize = 4 << extract64(ctr_el0, 16, 4);
+    const uintptr_t icache_lsize = qemu_icache_linesize;
+    const uintptr_t dcache_lsize = qemu_dcache_linesize;
     uintptr_t p;
 
     /*

From c79a8e840c435bc26a251e34b043318e8b2081db Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Mon, 20 Jun 2022 18:48:37 -0700
Subject: [PATCH 9/9] util/cacheflush: Optimize flushing when ppc host has
 coherent icache

On Linux, the AT_HWCAP bit PPC_FEATURE_ICACHE_SNOOP indicates that
we can use a simplified 3-instruction flush sequence.
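A standalone sketch of the hwcap test (Linux-only; the constant's value is
quoted from the kernel uapi <asm/cputable.h> in case that header is not
available):

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef PPC_FEATURE_ICACHE_SNOOP
    #define PPC_FEATURE_ICACHE_SNOOP 0x00002000   /* linux uapi value */
    #endif

    int main(void)
    {
        unsigned long hwcap = getauxval(AT_HWCAP);
        printf("coherent icache: %s\n",
               (hwcap & PPC_FEATURE_ICACHE_SNOOP) ? "yes" : "no");
        return 0;
    }

QEMU itself uses qemu_getauxval() for portability, as in the hunk below.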
Signed-off-by: Nicholas Piggin Message-Id: <20220519141131.29839-1-npiggin@gmail.com> [rth: update after merging cacheflush.c and cacheinfo.c] Signed-off-by: Richard Henderson Reviewed-by: Peter Maydell Message-Id: <20220621014837.189139-4-richard.henderson@linaro.org> --- util/cacheflush.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/util/cacheflush.c b/util/cacheflush.c index 01b6cb7583..2c2c73e085 100644 --- a/util/cacheflush.c +++ b/util/cacheflush.c @@ -117,6 +117,10 @@ static void sys_cache_info(int *isize, int *dsize) * Architecture (+ OS) specific cache detection mechanisms. */ +#if defined(__powerpc__) +static bool have_coherent_icache; +#endif + #if defined(__aarch64__) && !defined(CONFIG_DARWIN) /* Apple does not expose CTR_EL0, so we must use system interfaces. */ static uint64_t save_ctr_el0; @@ -156,6 +160,7 @@ static void arch_cache_info(int *isize, int *dsize) if (*dsize == 0) { *dsize = qemu_getauxval(AT_DCACHEBSIZE); } + have_coherent_icache = qemu_getauxval(AT_HWCAP) & PPC_FEATURE_ICACHE_SNOOP; } #else @@ -298,8 +303,24 @@ void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) { uintptr_t p, b, e; - size_t dsize = qemu_dcache_linesize; - size_t isize = qemu_icache_linesize; + size_t dsize, isize; + + /* + * Some processors have coherent caches and support a simplified + * flushing procedure. See + * POWER9 UM, 4.6.2.2 Instruction Cache Block Invalidate (icbi) + * https://ibm.ent.box.com/s/tmklq90ze7aj8f4n32er1mu3sy9u8k3k + */ + if (have_coherent_icache) { + asm volatile ("sync\n\t" + "icbi 0,%0\n\t" + "isync" + : : "r"(rx) : "memory"); + return; + } + + dsize = qemu_dcache_linesize; + isize = qemu_icache_linesize; b = rw & ~(dsize - 1); e = (rw + len + dsize - 1) & ~(dsize - 1);