2003-08-09 01:58:05 +02:00
|
|
|
/*
|
|
|
|
* Software MMU support
|
2007-09-16 23:08:06 +02:00
|
|
|
*
|
2011-09-21 22:00:18 +02:00
|
|
|
* Generate helpers used by TCG for qemu_ld/st ops and code load
|
|
|
|
* functions.
|
|
|
|
*
|
|
|
|
* Included from target op helpers and exec.c.
|
|
|
|
*
|
2003-08-09 01:58:05 +02:00
|
|
|
* Copyright (c) 2003 Fabrice Bellard
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
2009-07-16 22:47:01 +02:00
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
2003-08-09 01:58:05 +02:00
|
|
|
*/
|
2012-12-17 18:20:00 +01:00
|
|
|
#include "qemu/timer.h"
|
2013-11-07 19:55:56 +01:00
|
|
|
#include "exec/address-spaces.h"
|
2012-12-17 18:19:49 +01:00
|
|
|
#include "exec/memory.h"
|
2010-03-29 21:24:00 +02:00
|
|
|
|
2003-08-09 01:58:05 +02:00
|
|
|
/* Number of bytes moved by each helper generated from this template.
   SHIFT is not defined in this file; it is expected to be defined
   before the template is included.  */
#define DATA_SIZE (1 << SHIFT)

/* Pick the name fragments and value types that go with DATA_SIZE:
     SUFFIX      size letter pasted into helper and store-primitive names
     LSUFFIX     size suffix pasted into load-primitive names
     SDATA_TYPE  signed integer type that is DATA_SIZE bytes wide
     DATA_TYPE   unsigned integer type that is DATA_SIZE bytes wide  */
#if DATA_SIZE == 1
#define SUFFIX b
#define LSUFFIX ub
#define SDATA_TYPE int8_t
#define DATA_TYPE uint8_t
#elif DATA_SIZE == 2
#define SUFFIX w
#define LSUFFIX uw
#define SDATA_TYPE int16_t
#define DATA_TYPE uint16_t
#elif DATA_SIZE == 4
#define SUFFIX l
#define LSUFFIX l
#define SDATA_TYPE int32_t
#define DATA_TYPE uint32_t
#elif DATA_SIZE == 8
#define SUFFIX q
#define LSUFFIX q
#define SDATA_TYPE int64_t
#define DATA_TYPE uint64_t
#else
#error unsupported data size
#endif
|
|
|
|
|
2013-08-27 23:09:14 +02:00
|
|
|
|
|
|
|
/* TCG generated code is simplest when a load helper hands back a value
   that is already extended to the register size of the host, because
   no ABI-specific return type promotion is then involved.  That size is
   tcg_target_long, except for 64-bit data on a 32-bit host, where
   uint64_t is always available.  The widened value is not provided for
   SOFTMMU_CODE_ACCESS.

   SSUFFIX (the signed-load name fragment) only exists in the widened
   configuration.  */
#if !defined(SOFTMMU_CODE_ACCESS) && DATA_SIZE != 8
# define WORD_TYPE tcg_target_ulong
# define USUFFIX glue(u, SUFFIX)
# define SSUFFIX glue(s, SUFFIX)
#else
# define WORD_TYPE DATA_TYPE
# define USUFFIX SUFFIX
#endif
|
|
|
|
|
2004-10-03 17:07:13 +02:00
|
|
|
/* Access type reported on read faults, and the TLB entry field that
   holds the comparison address for reads: data loads by default,
   instruction fetches when SOFTMMU_CODE_ACCESS is defined.  */
#ifndef SOFTMMU_CODE_ACCESS
#define READ_ACCESS_TYPE MMU_DATA_LOAD
#define ADDR_READ addr_read
#else
#define READ_ACCESS_TYPE MMU_INST_FETCH
#define ADDR_READ addr_code
#endif
|
|
|
|
|
2013-09-04 20:45:20 +02:00
|
|
|
/* BSWAP(X): byte-swap a value of DATA_SIZE bytes.  Any size without a
   dedicated swap routine below (in practice a single byte) is passed
   through unchanged.  */
#if DATA_SIZE == 2
# define BSWAP(X) bswap16(X)
#elif DATA_SIZE == 4
# define BSWAP(X) bswap32(X)
#elif DATA_SIZE == 8
# define BSWAP(X) bswap64(X)
#else
# define BSWAP(X) (X)
#endif
|
|
|
|
|
|
|
|
/* TGT_BE(X) / TGT_LE(X): convert between target byte order and
   big-endian / little-endian.  The conversion that matches the target's
   own order is the identity, the other one is a byte swap.  */
#ifndef TARGET_WORDS_BIGENDIAN
# define TGT_BE(X) BSWAP(X)
# define TGT_LE(X) (X)
#else
# define TGT_BE(X) (X)
# define TGT_LE(X) BSWAP(X)
#endif
|
|
|
|
|
|
|
|
/* Names of the helpers emitted by this template.  Multi-byte sizes get
   distinct little-endian and big-endian entry points.  For a single
   byte there is only one "ret" flavour, and the big-endian names are
   aliases of the little-endian ones.  */
#if DATA_SIZE != 1
# define helper_le_ld_name  glue(glue(helper_le_ld, USUFFIX), MMUSUFFIX)
# define helper_be_ld_name  glue(glue(helper_be_ld, USUFFIX), MMUSUFFIX)
# define helper_le_lds_name glue(glue(helper_le_ld, SSUFFIX), MMUSUFFIX)
# define helper_be_lds_name glue(glue(helper_be_ld, SSUFFIX), MMUSUFFIX)
# define helper_le_st_name  glue(glue(helper_le_st, SUFFIX), MMUSUFFIX)
# define helper_be_st_name  glue(glue(helper_be_st, SUFFIX), MMUSUFFIX)
#else
# define helper_le_ld_name  glue(glue(helper_ret_ld, USUFFIX), MMUSUFFIX)
# define helper_be_ld_name  helper_le_ld_name
# define helper_le_lds_name glue(glue(helper_ret_ld, SSUFFIX), MMUSUFFIX)
# define helper_be_lds_name helper_le_lds_name
# define helper_le_st_name  glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)
# define helper_be_st_name  helper_le_st_name
#endif
|
|
|
|
|
|
|
|
/* Target-endian aliases: resolve to whichever of the little-endian or
   big-endian helpers matches the target's byte order.  */
#ifndef TARGET_WORDS_BIGENDIAN
# define helper_te_ld_name helper_le_ld_name
# define helper_te_st_name helper_le_st_name
#else
# define helper_te_ld_name helper_be_ld_name
# define helper_te_st_name helper_be_st_name
#endif
|
|
|
|
|
2014-03-28 17:55:24 +01:00
|
|
|
#ifndef SOFTMMU_CODE_ACCESS
|
2012-09-02 17:28:56 +02:00
|
|
|
static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
|
2015-04-26 17:49:23 +02:00
|
|
|
CPUIOTLBEntry *iotlbentry,
|
2008-06-29 03:03:05 +02:00
|
|
|
target_ulong addr,
|
2012-04-09 16:20:20 +02:00
|
|
|
uintptr_t retaddr)
|
2003-08-09 01:58:05 +02:00
|
|
|
{
|
2013-05-24 16:10:39 +02:00
|
|
|
uint64_t val;
|
2013-12-17 04:06:51 +01:00
|
|
|
CPUState *cpu = ENV_GET_CPU(env);
|
2015-04-26 17:49:23 +02:00
|
|
|
hwaddr physaddr = iotlbentry->addr;
|
2016-01-21 15:15:05 +01:00
|
|
|
MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
|
2012-03-08 17:08:35 +01:00
|
|
|
|
2008-06-09 02:20:13 +02:00
|
|
|
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
|
2013-08-26 03:41:01 +02:00
|
|
|
cpu->mem_io_pc = retaddr;
|
2015-06-24 14:16:26 +02:00
|
|
|
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
|
2013-09-01 17:21:47 +02:00
|
|
|
cpu_io_recompile(cpu, retaddr);
|
2008-06-29 03:03:05 +02:00
|
|
|
}
|
2003-08-09 01:58:05 +02:00
|
|
|
|
2013-08-26 03:41:01 +02:00
|
|
|
cpu->mem_io_vaddr = addr;
|
2015-04-26 17:49:23 +02:00
|
|
|
memory_region_dispatch_read(mr, physaddr, &val, 1 << SHIFT,
|
2015-04-26 17:49:24 +02:00
|
|
|
iotlbentry->attrs);
|
2013-05-24 16:10:39 +02:00
|
|
|
return val;
|
2003-08-09 01:58:05 +02:00
|
|
|
}
|
2014-03-28 17:55:24 +01:00
|
|
|
#endif
|
2003-08-09 01:58:05 +02:00
|
|
|
|
2015-05-13 18:10:33 +02:00
|
|
|
WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
|
|
|
|
TCGMemOpIdx oi, uintptr_t retaddr)
|
2003-08-09 01:58:05 +02:00
|
|
|
{
|
2015-05-13 18:10:33 +02:00
|
|
|
unsigned mmu_idx = get_mmuidx(oi);
|
2013-07-26 20:29:15 +02:00
|
|
|
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
|
|
|
|
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
|
2016-07-14 21:43:06 +02:00
|
|
|
unsigned a_bits = get_alignment_bits(get_memop(oi));
|
2013-07-26 20:29:15 +02:00
|
|
|
uintptr_t haddr;
|
2013-09-04 20:45:20 +02:00
|
|
|
DATA_TYPE res;
|
2007-09-17 10:09:54 +02:00
|
|
|
|
2016-07-14 21:43:06 +02:00
|
|
|
if (addr & ((1 << a_bits) - 1)) {
|
2016-06-23 20:16:46 +02:00
|
|
|
cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
|
|
|
|
mmu_idx, retaddr);
|
|
|
|
}
|
|
|
|
|
2013-07-26 20:29:15 +02:00
|
|
|
/* If the TLB entry is for a different page, reload and try again. */
|
|
|
|
if ((addr & TARGET_PAGE_MASK)
|
|
|
|
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
|
2016-07-06 20:26:52 +02:00
|
|
|
if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
|
implementing victim TLB for QEMU system emulated TLB
QEMU system mode page table walks are expensive. Taken by running QEMU
qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a
4-level page tables in guest Linux OS takes ~450 X86 instructions on
average.
QEMU system mode TLB is implemented using a directly-mapped hashtable.
This structure suffers from conflict misses. Increasing the
associativity of the TLB may not be the solution to conflict misses as
all the ways may have to be walked in serial.
A victim TLB is a TLB used to hold translations evicted from the
primary TLB upon replacement. The victim TLB lies between the main TLB
and its refill path. Victim TLB is of greater associativity (fully
associative in this patch). It takes longer to lookup the victim TLB,
but its likely better than a full page table walk. The memory
translation path is changed as follows :
Before Victim TLB:
1. Inline TLB lookup
2. Exit code cache on TLB miss.
3. Check for unaligned, IO accesses
4. TLB refill.
5. Do the memory access.
6. Return to code cache.
After Victim TLB:
1. Inline TLB lookup
2. Exit code cache on TLB miss.
3. Check for unaligned, IO accesses
4. Victim TLB lookup.
5. If victim TLB misses, TLB refill
6. Do the memory access.
7. Return to code cache
The advantage is that victim TLB can offer more associativity to a
directly mapped TLB and thus potentially fewer page table walks while
still keeping the time taken to flush within reasonable limits.
However, placing a victim TLB before the refill path increase TLB
refill path as the victim TLB is consulted before the TLB refill. The
performance results demonstrate that the pros outweigh the cons.
some performance results taken on SPECINT2006 train
datasets and kernel boot and qemu configure script on an
Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the
Google Doc link below.
https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing
In summary, victim TLB improves the performance of qemu-system-x86_64 by
11% on average on SPECINT2006, kernelboot and qemu configscript and with
highest improvement of in 26% in 456.hmmer. And victim TLB does not result
in any performance degradation in any of the measured benchmarks. Furthermore,
the implemented victim TLB is architecture independent and is expected to
benefit other architectures in QEMU as well.
Although there are measurement fluctuations, the performance
improvement is very significant and by no means in the range of
noises.
Signed-off-by: Xin Tong <trent.tong@gmail.com>
Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 03:35:23 +02:00
|
|
|
tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
|
|
|
|
mmu_idx, retaddr);
|
|
|
|
}
|
2013-07-26 20:29:15 +02:00
|
|
|
tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle an IO access. */
|
|
|
|
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
|
2015-04-26 17:49:23 +02:00
|
|
|
CPUIOTLBEntry *iotlbentry;
|
2013-07-26 20:29:15 +02:00
|
|
|
if ((addr & (DATA_SIZE - 1)) != 0) {
|
|
|
|
goto do_unaligned_access;
|
2003-08-09 01:58:05 +02:00
|
|
|
}
|
2015-04-26 17:49:23 +02:00
|
|
|
iotlbentry = &env->iotlb[mmu_idx][index];
|
2013-09-04 20:45:20 +02:00
|
|
|
|
|
|
|
/* ??? Note that the io helpers always read data in the target
|
|
|
|
byte ordering. We should push the LE/BE request down into io. */
|
2015-04-26 17:49:23 +02:00
|
|
|
res = glue(io_read, SUFFIX)(env, iotlbentry, addr, retaddr);
|
2013-09-04 20:45:20 +02:00
|
|
|
res = TGT_LE(res);
|
|
|
|
return res;
|
2013-07-26 20:29:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle slow unaligned access (it spans two pages or IO). */
|
|
|
|
if (DATA_SIZE > 1
|
|
|
|
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
|
|
|
|
>= TARGET_PAGE_SIZE)) {
|
|
|
|
target_ulong addr1, addr2;
|
2013-09-04 20:45:20 +02:00
|
|
|
DATA_TYPE res1, res2;
|
2013-07-26 20:29:15 +02:00
|
|
|
unsigned shift;
|
|
|
|
do_unaligned_access:
|
|
|
|
addr1 = addr & ~(DATA_SIZE - 1);
|
|
|
|
addr2 = addr1 + DATA_SIZE;
|
2016-07-26 02:39:16 +02:00
|
|
|
res1 = helper_le_ld_name(env, addr1, oi, retaddr);
|
|
|
|
res2 = helper_le_ld_name(env, addr2, oi, retaddr);
|
2013-07-26 20:29:15 +02:00
|
|
|
shift = (addr & (DATA_SIZE - 1)) * 8;
|
2013-09-04 20:45:20 +02:00
|
|
|
|
|
|
|
/* Little-endian combine. */
|
2013-07-26 20:29:15 +02:00
|
|
|
res = (res1 >> shift) | (res2 << ((DATA_SIZE * 8) - shift));
|
2013-09-04 20:45:20 +02:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
haddr = addr + env->tlb_table[mmu_idx][index].addend;
|
|
|
|
#if DATA_SIZE == 1
|
|
|
|
res = glue(glue(ld, LSUFFIX), _p)((uint8_t *)haddr);
|
|
|
|
#else
|
|
|
|
res = glue(glue(ld, LSUFFIX), _le_p)((uint8_t *)haddr);
|
|
|
|
#endif
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if DATA_SIZE > 1
/* Big-endian load of DATA_SIZE bytes at addr; only generated for
   multi-byte sizes.

   env      CPU state whose tlb_table / iotlb arrays are consulted
   addr     address to load from
   oi       combined memory-op / MMU index, decoded with get_memop()
            and get_mmuidx()
   retaddr  return address of the caller, passed on to the fault, fill
            and IO routines

   Returns the loaded DATA_TYPE value converted to WORD_TYPE.

   The locals env, mmu_idx and index keep these exact names on purpose:
   VICTIM_TLB_HIT is defined outside this file and presumably refers to
   them by name -- NOTE(review): confirm before renaming.  */
WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
                            TCGMemOpIdx oi, uintptr_t retaddr)
{
    unsigned mmu_idx = get_mmuidx(oi);
    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
    unsigned a_bits = get_alignment_bits(get_memop(oi));
    target_ulong addr1, addr2;
    DATA_TYPE res, res1, res2;
    unsigned shift;
    uintptr_t haddr;

    /* addr must be a multiple of 1 << a_bits.  */
    if ((addr & ((1 << a_bits) - 1)) != 0) {
        cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                             mmu_idx, retaddr);
    }

    /* The TLB entry describes some other page (or is invalid): try the
       victim TLB, fill on a miss there, then look at the entry again.  */
    if ((tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))
        != (addr & TARGET_PAGE_MASK)) {
        if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
            tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                     mmu_idx, retaddr);
        }
        tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
    }

    if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
        /* Bits below the page mask mark an IO access.  */
        if ((addr & (DATA_SIZE - 1)) == 0) {
            /* ??? The io helpers always deliver data in the target byte
               ordering; the LE/BE request should really be pushed down
               into io.  */
            res = glue(io_read, SUFFIX)(env, &env->iotlb[mmu_idx][index],
                                        addr, retaddr);
            res = TGT_BE(res);
            return res;
        }
        /* Misaligned IO: handled by the slow path below.  */
    } else if (!(DATA_SIZE > 1
                 && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
                             >= TARGET_PAGE_SIZE))) {
        /* The whole access lies within one page: load through the
           addend of the TLB entry.  Going through res keeps the
           conversion to DATA_TYPE before widening to WORD_TYPE.  */
        haddr = addr + env->tlb_table[mmu_idx][index].addend;
        res = glue(glue(ld, LSUFFIX), _be_p)((uint8_t *)haddr);
        return res;
    }

    /* Slow unaligned access (it spans two pages, or it is IO): load the
       two aligned values that cover the request and splice them.  */
    addr1 = addr & ~(DATA_SIZE - 1);
    addr2 = addr1 + DATA_SIZE;
    res1 = helper_be_ld_name(env, addr1, oi, retaddr);
    res2 = helper_be_ld_name(env, addr2, oi, retaddr);
    shift = (addr & (DATA_SIZE - 1)) * 8;

    /* Big-endian combine.  */
    res = (res1 << shift) | (res2 >> ((DATA_SIZE * 8) - shift));
    return res;
}
#endif /* DATA_SIZE > 1 */
|
2003-08-09 01:58:05 +02:00
|
|
|
|
2004-10-03 17:07:13 +02:00
|
|
|
#ifndef SOFTMMU_CODE_ACCESS
|
|
|
|
|
2013-08-27 23:09:14 +02:00
|
|
|
/* Provide signed versions of the load routines as well. We can of course
|
|
|
|
avoid this for 64-bit data, or for 32-bit data on 32-bit host. */
|
|
|
|
#if DATA_SIZE * 8 < TCG_TARGET_REG_BITS
|
2013-09-04 20:45:20 +02:00
|
|
|
WORD_TYPE helper_le_lds_name(CPUArchState *env, target_ulong addr,
|
2015-05-13 18:10:33 +02:00
|
|
|
TCGMemOpIdx oi, uintptr_t retaddr)
|
2013-09-04 20:45:20 +02:00
|
|
|
{
|
2015-05-13 18:10:33 +02:00
|
|
|
return (SDATA_TYPE)helper_le_ld_name(env, addr, oi, retaddr);
|
2013-09-04 20:45:20 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
# if DATA_SIZE > 1
|
|
|
|
WORD_TYPE helper_be_lds_name(CPUArchState *env, target_ulong addr,
|
2015-05-13 18:10:33 +02:00
|
|
|
TCGMemOpIdx oi, uintptr_t retaddr)
|
2013-08-27 23:09:14 +02:00
|
|
|
{
|
2015-05-13 18:10:33 +02:00
|
|
|
return (SDATA_TYPE)helper_be_ld_name(env, addr, oi, retaddr);
|
2013-08-27 23:09:14 +02:00
|
|
|
}
|
2013-09-04 20:45:20 +02:00
|
|
|
# endif
|
2013-08-27 23:09:14 +02:00
|
|
|
#endif
|
|
|
|
|
2012-09-02 17:28:56 +02:00
|
|
|
static inline void glue(io_write, SUFFIX)(CPUArchState *env,
|
2015-04-26 17:49:23 +02:00
|
|
|
CPUIOTLBEntry *iotlbentry,
|
2004-10-03 17:07:13 +02:00
|
|
|
DATA_TYPE val,
|
2008-06-09 02:20:13 +02:00
|
|
|
target_ulong addr,
|
2012-04-09 16:20:20 +02:00
|
|
|
uintptr_t retaddr)
|
2004-10-03 17:07:13 +02:00
|
|
|
{
|
2013-12-17 04:06:51 +01:00
|
|
|
CPUState *cpu = ENV_GET_CPU(env);
|
2015-04-26 17:49:23 +02:00
|
|
|
hwaddr physaddr = iotlbentry->addr;
|
2016-01-21 15:15:05 +01:00
|
|
|
MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
|
2012-03-08 17:08:35 +01:00
|
|
|
|
2008-06-09 02:20:13 +02:00
|
|
|
physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
|
2015-06-24 14:16:26 +02:00
|
|
|
if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
|
2013-09-01 17:21:47 +02:00
|
|
|
cpu_io_recompile(cpu, retaddr);
|
2008-06-29 03:03:05 +02:00
|
|
|
}
|
2004-10-03 17:07:13 +02:00
|
|
|
|
2013-08-26 03:41:01 +02:00
|
|
|
cpu->mem_io_vaddr = addr;
|
|
|
|
cpu->mem_io_pc = retaddr;
|
2015-04-26 17:49:23 +02:00
|
|
|
memory_region_dispatch_write(mr, physaddr, val, 1 << SHIFT,
|
2015-04-26 17:49:24 +02:00
|
|
|
iotlbentry->attrs);
|
2004-10-03 17:07:13 +02:00
|
|
|
}
|
2003-08-09 01:58:05 +02:00
|
|
|
|
2013-09-04 20:45:20 +02:00
|
|
|
void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
|
2015-05-13 18:10:33 +02:00
|
|
|
TCGMemOpIdx oi, uintptr_t retaddr)
|
2003-08-09 01:58:05 +02:00
|
|
|
{
|
2015-05-13 18:10:33 +02:00
|
|
|
unsigned mmu_idx = get_mmuidx(oi);
|
2013-07-26 20:29:15 +02:00
|
|
|
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
|
|
|
|
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
|
2016-07-14 21:43:06 +02:00
|
|
|
unsigned a_bits = get_alignment_bits(get_memop(oi));
|
2013-07-26 20:29:15 +02:00
|
|
|
uintptr_t haddr;
|
2007-09-17 10:09:54 +02:00
|
|
|
|
2016-07-14 21:43:06 +02:00
|
|
|
if (addr & ((1 << a_bits) - 1)) {
|
2016-06-23 20:16:46 +02:00
|
|
|
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
|
|
|
|
mmu_idx, retaddr);
|
|
|
|
}
|
|
|
|
|
2013-07-26 20:29:15 +02:00
|
|
|
/* If the TLB entry is for a different page, reload and try again. */
|
|
|
|
if ((addr & TARGET_PAGE_MASK)
|
|
|
|
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
|
2016-07-06 20:26:52 +02:00
|
|
|
if (!VICTIM_TLB_HIT(addr_write, addr)) {
|
2014-07-07 12:23:56 +02:00
|
|
|
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
|
implementing victim TLB for QEMU system emulated TLB
QEMU system mode page table walks are expensive. Taken by running QEMU
qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a
4-level page tables in guest Linux OS takes ~450 X86 instructions on
average.
QEMU system mode TLB is implemented using a directly-mapped hashtable.
This structure suffers from conflict misses. Increasing the
associativity of the TLB may not be the solution to conflict misses as
all the ways may have to be walked in serial.
A victim TLB is a TLB used to hold translations evicted from the
primary TLB upon replacement. The victim TLB lies between the main TLB
and its refill path. Victim TLB is of greater associativity (fully
associative in this patch). It takes longer to lookup the victim TLB,
but its likely better than a full page table walk. The memory
translation path is changed as follows :
Before Victim TLB:
1. Inline TLB lookup
2. Exit code cache on TLB miss.
3. Check for unaligned, IO accesses
4. TLB refill.
5. Do the memory access.
6. Return to code cache.
After Victim TLB:
1. Inline TLB lookup
2. Exit code cache on TLB miss.
3. Check for unaligned, IO accesses
4. Victim TLB lookup.
5. If victim TLB misses, TLB refill
6. Do the memory access.
7. Return to code cache
The advantage is that victim TLB can offer more associativity to a
directly mapped TLB and thus potentially fewer page table walks while
still keeping the time taken to flush within reasonable limits.
However, placing a victim TLB before the refill path increase TLB
refill path as the victim TLB is consulted before the TLB refill. The
performance results demonstrate that the pros outweigh the cons.
some performance results taken on SPECINT2006 train
datasets and kernel boot and qemu configure script on an
Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the
Google Doc link below.
https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing
In summary, victim TLB improves the performance of qemu-system-x86_64 by
11% on average on SPECINT2006, kernelboot and qemu configscript and with
highest improvement of in 26% in 456.hmmer. And victim TLB does not result
in any performance degradation in any of the measured benchmarks. Furthermore,
the implemented victim TLB is architecture independent and is expected to
benefit other architectures in QEMU as well.
Although there are measurement fluctuations, the performance
improvement is very significant and by no means in the range of
noises.
Signed-off-by: Xin Tong <trent.tong@gmail.com>
Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 03:35:23 +02:00
|
|
|
}
|
2013-07-26 20:29:15 +02:00
|
|
|
tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle an IO access. */
|
|
|
|
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
|
2015-04-26 17:49:23 +02:00
|
|
|
CPUIOTLBEntry *iotlbentry;
|
2013-07-26 20:29:15 +02:00
|
|
|
if ((addr & (DATA_SIZE - 1)) != 0) {
|
|
|
|
goto do_unaligned_access;
|
|
|
|
}
|
2015-04-26 17:49:23 +02:00
|
|
|
iotlbentry = &env->iotlb[mmu_idx][index];
|
2013-09-04 20:45:20 +02:00
|
|
|
|
|
|
|
/* ??? Note that the io helpers always read data in the target
|
|
|
|
byte ordering. We should push the LE/BE request down into io. */
|
|
|
|
val = TGT_LE(val);
|
2015-04-26 17:49:23 +02:00
|
|
|
glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
|
2013-07-26 20:29:15 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle slow unaligned access (it spans two pages or IO). */
|
|
|
|
if (DATA_SIZE > 1
|
|
|
|
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
|
|
|
|
>= TARGET_PAGE_SIZE)) {
|
cputlb: Fix for self-modifying writes across page boundaries
As it currently stands, QEMU does not properly handle self-modifying code
when the write is unaligned and crosses a page boundary. The procedure
for handling a write to the current translation block is to write-protect
the current translation block, catch the write, split up the translation
block into the current instruction (which remains write-protected so that
the current instruction is not modified) and the remaining instructions
in the translation block, and then restore the CPU state to before the
write occurred so the write will be retried and successfully executed.
However, since unaligned writes across pages are split into one-byte
writes for simplicity, writes to the second page (which is not the
current TB) may succeed before a write to the current TB is attempted,
and since these writes are not invalidated before resuming state after
splitting the TB, these writes will be performed a second time, thus
corrupting the second page. Credit goes to Patrick Hulin for
discovering this.
In recent 64-bit versions of Windows running in emulated mode, this
results in either being very unstable (a BSOD after a couple minutes of
uptime), or being entirely unable to boot. Windows performs one or more
8-byte unaligned self-modifying writes (xors) which intersect the end
of the current TB and the beginning of the next TB, which runs into the
aforementioned issue. This commit fixes that issue by making the
unaligned write loop perform the writes in forwards order, instead of
reverse order. This way, QEMU immediately tries to write to the current
TB, and splits the TB before any write to the second page is executed.
The write then proceeds as intended. With this patch applied, I am able
to boot and use Windows 7 64-bit and Windows 10 64-bit in QEMU without
KVM.
Per Richard Henderson's input, this patch also ensures the second page
is in the TLB before executing the write loop, to ensure the second
page is mapped.
The original discussion of the issue is located at
http://lists.nongnu.org/archive/html/qemu-devel/2014-08/msg02161.html.
Signed-off-by: Samuel Damashek <samuel.damashek@invincea.com>
Message-Id: <20160706182652.16190-1-samuel.damashek@invincea.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-07-08 21:54:34 +02:00
|
|
|
int i, index2;
|
|
|
|
target_ulong page2, tlb_addr2;
|
2013-07-26 20:29:15 +02:00
|
|
|
do_unaligned_access:
|
cputlb: Fix for self-modifying writes across page boundaries
As it currently stands, QEMU does not properly handle self-modifying code
when the write is unaligned and crosses a page boundary. The procedure
for handling a write to the current translation block is to write-protect
the current translation block, catch the write, split up the translation
block into the current instruction (which remains write-protected so that
the current instruction is not modified) and the remaining instructions
in the translation block, and then restore the CPU state to before the
write occurred so the write will be retried and successfully executed.
However, since unaligned writes across pages are split into one-byte
writes for simplicity, writes to the second page (which is not the
current TB) may succeed before a write to the current TB is attempted,
and since these writes are not invalidated before resuming state after
splitting the TB, these writes will be performed a second time, thus
corrupting the second page. Credit goes to Patrick Hulin for
discovering this.
In recent 64-bit versions of Windows running in emulated mode, this
results in either being very unstable (a BSOD after a couple minutes of
uptime), or being entirely unable to boot. Windows performs one or more
8-byte unaligned self-modifying writes (xors) which intersect the end
of the current TB and the beginning of the next TB, which runs into the
aforementioned issue. This commit fixes that issue by making the
unaligned write loop perform the writes in forwards order, instead of
reverse order. This way, QEMU immediately tries to write to the current
TB, and splits the TB before any write to the second page is executed.
The write then proceeds as intended. With this patch applied, I am able
to boot and use Windows 7 64-bit and Windows 10 64-bit in QEMU without
KVM.
Per Richard Henderson's input, this patch also ensures the second page
is in the TLB before executing the write loop, to ensure the second
page is mapped.
The original discussion of the issue is located at
http://lists.nongnu.org/archive/html/qemu-devel/2014-08/msg02161.html.
Signed-off-by: Samuel Damashek <samuel.damashek@invincea.com>
Message-Id: <20160706182652.16190-1-samuel.damashek@invincea.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-07-08 21:54:34 +02:00
|
|
|
/* Ensure the second page is in the TLB. Note that the first page
|
|
|
|
is already guaranteed to be filled, and that the second page
|
|
|
|
cannot evict the first. */
|
|
|
|
page2 = (addr + DATA_SIZE) & TARGET_PAGE_MASK;
|
|
|
|
index2 = (page2 >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
|
|
|
|
tlb_addr2 = env->tlb_table[mmu_idx][index2].addr_write;
|
|
|
|
if (page2 != (tlb_addr2 & (TARGET_PAGE_MASK | TLB_INVALID_MASK))
|
|
|
|
&& !VICTIM_TLB_HIT(addr_write, page2)) {
|
|
|
|
tlb_fill(ENV_GET_CPU(env), page2, MMU_DATA_STORE,
|
|
|
|
mmu_idx, retaddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XXX: not efficient, but simple. */
|
|
|
|
/* This loop must go in the forward direction to avoid issues
|
|
|
|
with self-modifying code in Windows 64-bit. */
|
|
|
|
for (i = 0; i < DATA_SIZE; ++i) {
|
2013-09-04 20:45:20 +02:00
|
|
|
/* Little-endian extract. */
|
2013-07-26 20:29:15 +02:00
|
|
|
uint8_t val8 = val >> (i * 8);
|
2013-09-04 20:45:20 +02:00
|
|
|
glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
|
2016-07-26 02:39:16 +02:00
|
|
|
oi, retaddr);
|
2013-09-04 20:45:20 +02:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
haddr = addr + env->tlb_table[mmu_idx][index].addend;
|
|
|
|
#if DATA_SIZE == 1
|
|
|
|
glue(glue(st, SUFFIX), _p)((uint8_t *)haddr, val);
|
|
|
|
#else
|
|
|
|
glue(glue(st, SUFFIX), _le_p)((uint8_t *)haddr, val);
|
2005-12-05 20:57:57 +01:00
|
|
|
#endif
|
2013-09-04 20:45:20 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#if DATA_SIZE > 1
|
|
|
|
void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
|
2015-05-13 18:10:33 +02:00
|
|
|
TCGMemOpIdx oi, uintptr_t retaddr)
|
2013-09-04 20:45:20 +02:00
|
|
|
{
|
2015-05-13 18:10:33 +02:00
|
|
|
unsigned mmu_idx = get_mmuidx(oi);
|
2013-09-04 20:45:20 +02:00
|
|
|
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
|
|
|
|
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
|
2016-07-14 21:43:06 +02:00
|
|
|
unsigned a_bits = get_alignment_bits(get_memop(oi));
|
2013-09-04 20:45:20 +02:00
|
|
|
uintptr_t haddr;
|
|
|
|
|
2016-07-14 21:43:06 +02:00
|
|
|
if (addr & ((1 << a_bits) - 1)) {
|
2016-06-23 20:16:46 +02:00
|
|
|
cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
|
|
|
|
mmu_idx, retaddr);
|
|
|
|
}
|
|
|
|
|
2013-09-04 20:45:20 +02:00
|
|
|
/* If the TLB entry is for a different page, reload and try again. */
|
|
|
|
if ((addr & TARGET_PAGE_MASK)
|
|
|
|
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
|
2016-07-06 20:26:52 +02:00
|
|
|
if (!VICTIM_TLB_HIT(addr_write, addr)) {
|
2014-07-07 12:23:56 +02:00
|
|
|
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
|
implementing victim TLB for QEMU system emulated TLB
QEMU system mode page table walks are expensive. Taken by running QEMU
qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a
4-level page tables in guest Linux OS takes ~450 X86 instructions on
average.
QEMU system mode TLB is implemented using a directly-mapped hashtable.
This structure suffers from conflict misses. Increasing the
associativity of the TLB may not be the solution to conflict misses as
all the ways may have to be walked in serial.
A victim TLB is a TLB used to hold translations evicted from the
primary TLB upon replacement. The victim TLB lies between the main TLB
and its refill path. Victim TLB is of greater associativity (fully
associative in this patch). It takes longer to lookup the victim TLB,
but its likely better than a full page table walk. The memory
translation path is changed as follows :
Before Victim TLB:
1. Inline TLB lookup
2. Exit code cache on TLB miss.
3. Check for unaligned, IO accesses
4. TLB refill.
5. Do the memory access.
6. Return to code cache.
After Victim TLB:
1. Inline TLB lookup
2. Exit code cache on TLB miss.
3. Check for unaligned, IO accesses
4. Victim TLB lookup.
5. If victim TLB misses, TLB refill
6. Do the memory access.
7. Return to code cache
The advantage is that victim TLB can offer more associativity to a
directly mapped TLB and thus potentially fewer page table walks while
still keeping the time taken to flush within reasonable limits.
However, placing a victim TLB before the refill path increase TLB
refill path as the victim TLB is consulted before the TLB refill. The
performance results demonstrate that the pros outweigh the cons.
some performance results taken on SPECINT2006 train
datasets and kernel boot and qemu configure script on an
Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the
Google Doc link below.
https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing
In summary, victim TLB improves the performance of qemu-system-x86_64 by
11% on average on SPECINT2006, kernelboot and qemu configscript and with
highest improvement of in 26% in 456.hmmer. And victim TLB does not result
in any performance degradation in any of the measured benchmarks. Furthermore,
the implemented victim TLB is architecture independent and is expected to
benefit other architectures in QEMU as well.
Although there are measurement fluctuations, the performance
improvement is very significant and by no means in the range of
noises.
Signed-off-by: Xin Tong <trent.tong@gmail.com>
Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 03:35:23 +02:00
|
|
|
}
|
2013-09-04 20:45:20 +02:00
|
|
|
tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle an IO access. */
|
|
|
|
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
|
2015-04-26 17:49:23 +02:00
|
|
|
CPUIOTLBEntry *iotlbentry;
|
2013-09-04 20:45:20 +02:00
|
|
|
if ((addr & (DATA_SIZE - 1)) != 0) {
|
|
|
|
goto do_unaligned_access;
|
|
|
|
}
|
2015-04-26 17:49:23 +02:00
|
|
|
iotlbentry = &env->iotlb[mmu_idx][index];
|
2013-09-04 20:45:20 +02:00
|
|
|
|
|
|
|
/* ??? Note that the io helpers always read data in the target
|
|
|
|
byte ordering. We should push the LE/BE request down into io. */
|
|
|
|
val = TGT_BE(val);
|
2015-04-26 17:49:23 +02:00
|
|
|
glue(io_write, SUFFIX)(env, iotlbentry, val, addr, retaddr);
|
2013-09-04 20:45:20 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle slow unaligned access (it spans two pages or IO). */
|
|
|
|
if (DATA_SIZE > 1
|
|
|
|
&& unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
|
|
|
|
>= TARGET_PAGE_SIZE)) {
|
cputlb: Fix for self-modifying writes across page boundaries
As it currently stands, QEMU does not properly handle self-modifying code
when the write is unaligned and crosses a page boundary. The procedure
for handling a write to the current translation block is to write-protect
the current translation block, catch the write, split up the translation
block into the current instruction (which remains write-protected so that
the current instruction is not modified) and the remaining instructions
in the translation block, and then restore the CPU state to before the
write occurred so the write will be retried and successfully executed.
However, since unaligned writes across pages are split into one-byte
writes for simplicity, writes to the second page (which is not the
current TB) may succeed before a write to the current TB is attempted,
and since these writes are not invalidated before resuming state after
splitting the TB, these writes will be performed a second time, thus
corrupting the second page. Credit goes to Patrick Hulin for
discovering this.
In recent 64-bit versions of Windows running in emulated mode, this
results in either being very unstable (a BSOD after a couple minutes of
uptime), or being entirely unable to boot. Windows performs one or more
8-byte unaligned self-modifying writes (xors) which intersect the end
of the current TB and the beginning of the next TB, which runs into the
aforementioned issue. This commit fixes that issue by making the
unaligned write loop perform the writes in forwards order, instead of
reverse order. This way, QEMU immediately tries to write to the current
TB, and splits the TB before any write to the second page is executed.
The write then proceeds as intended. With this patch applied, I am able
to boot and use Windows 7 64-bit and Windows 10 64-bit in QEMU without
KVM.
Per Richard Henderson's input, this patch also ensures the second page
is in the TLB before executing the write loop, to ensure the second
page is mapped.
The original discussion of the issue is located at
http://lists.nongnu.org/archive/html/qemu-devel/2014-08/msg02161.html.
Signed-off-by: Samuel Damashek <samuel.damashek@invincea.com>
Message-Id: <20160706182652.16190-1-samuel.damashek@invincea.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-07-08 21:54:34 +02:00
|
|
|
int i, index2;
|
|
|
|
target_ulong page2, tlb_addr2;
|
2013-09-04 20:45:20 +02:00
|
|
|
do_unaligned_access:
|
cputlb: Fix for self-modifying writes across page boundaries
As it currently stands, QEMU does not properly handle self-modifying code
when the write is unaligned and crosses a page boundary. The procedure
for handling a write to the current translation block is to write-protect
the current translation block, catch the write, split up the translation
block into the current instruction (which remains write-protected so that
the current instruction is not modified) and the remaining instructions
in the translation block, and then restore the CPU state to before the
write occurred so the write will be retried and successfully executed.
However, since unaligned writes across pages are split into one-byte
writes for simplicity, writes to the second page (which is not the
current TB) may succeed before a write to the current TB is attempted,
and since these writes are not invalidated before resuming state after
splitting the TB, these writes will be performed a second time, thus
corrupting the second page. Credit goes to Patrick Hulin for
discovering this.
In recent 64-bit versions of Windows running in emulated mode, this
results in either being very unstable (a BSOD after a couple minutes of
uptime), or being entirely unable to boot. Windows performs one or more
8-byte unaligned self-modifying writes (xors) which intersect the end
of the current TB and the beginning of the next TB, which runs into the
aforementioned issue. This commit fixes that issue by making the
unaligned write loop perform the writes in forwards order, instead of
reverse order. This way, QEMU immediately tries to write to the current
TB, and splits the TB before any write to the second page is executed.
The write then proceeds as intended. With this patch applied, I am able
to boot and use Windows 7 64-bit and Windows 10 64-bit in QEMU without
KVM.
Per Richard Henderson's input, this patch also ensures the second page
is in the TLB before executing the write loop, to ensure the second
page is mapped.
The original discussion of the issue is located at
http://lists.nongnu.org/archive/html/qemu-devel/2014-08/msg02161.html.
Signed-off-by: Samuel Damashek <samuel.damashek@invincea.com>
Message-Id: <20160706182652.16190-1-samuel.damashek@invincea.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-07-08 21:54:34 +02:00
|
|
|
/* Ensure the second page is in the TLB. Note that the first page
|
|
|
|
is already guaranteed to be filled, and that the second page
|
|
|
|
cannot evict the first. */
|
|
|
|
page2 = (addr + DATA_SIZE) & TARGET_PAGE_MASK;
|
|
|
|
index2 = (page2 >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
|
|
|
|
tlb_addr2 = env->tlb_table[mmu_idx][index2].addr_write;
|
|
|
|
if (page2 != (tlb_addr2 & (TARGET_PAGE_MASK | TLB_INVALID_MASK))
|
|
|
|
&& !VICTIM_TLB_HIT(addr_write, page2)) {
|
|
|
|
tlb_fill(ENV_GET_CPU(env), page2, MMU_DATA_STORE,
|
|
|
|
mmu_idx, retaddr);
|
|
|
|
}
|
|
|
|
|
2013-09-04 20:45:20 +02:00
|
|
|
/* XXX: not efficient, but simple */
|
cputlb: Fix for self-modifying writes across page boundaries
As it currently stands, QEMU does not properly handle self-modifying code
when the write is unaligned and crosses a page boundary. The procedure
for handling a write to the current translation block is to write-protect
the current translation block, catch the write, split up the translation
block into the current instruction (which remains write-protected so that
the current instruction is not modified) and the remaining instructions
in the translation block, and then restore the CPU state to before the
write occurred so the write will be retried and successfully executed.
However, since unaligned writes across pages are split into one-byte
writes for simplicity, writes to the second page (which is not the
current TB) may succeed before a write to the current TB is attempted,
and since these writes are not invalidated before resuming state after
splitting the TB, these writes will be performed a second time, thus
corrupting the second page. Credit goes to Patrick Hulin for
discovering this.
In recent 64-bit versions of Windows running in emulated mode, this
results in either being very unstable (a BSOD after a couple minutes of
uptime), or being entirely unable to boot. Windows performs one or more
8-byte unaligned self-modifying writes (xors) which intersect the end
of the current TB and the beginning of the next TB, which runs into the
aforementioned issue. This commit fixes that issue by making the
unaligned write loop perform the writes in forwards order, instead of
reverse order. This way, QEMU immediately tries to write to the current
TB, and splits the TB before any write to the second page is executed.
The write then proceeds as intended. With this patch applied, I am able
to boot and use Windows 7 64-bit and Windows 10 64-bit in QEMU without
KVM.
Per Richard Henderson's input, this patch also ensures the second page
is in the TLB before executing the write loop, to ensure the second
page is mapped.
The original discussion of the issue is located at
http://lists.nongnu.org/archive/html/qemu-devel/2014-08/msg02161.html.
Signed-off-by: Samuel Damashek <samuel.damashek@invincea.com>
Message-Id: <20160706182652.16190-1-samuel.damashek@invincea.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-07-08 21:54:34 +02:00
|
|
|
/* This loop must go in the forward direction to avoid issues
|
|
|
|
with self-modifying code. */
|
|
|
|
for (i = 0; i < DATA_SIZE; ++i) {
|
2013-09-04 20:45:20 +02:00
|
|
|
/* Big-endian extract. */
|
|
|
|
uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
|
2013-07-26 20:29:15 +02:00
|
|
|
glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
|
2016-07-26 02:39:16 +02:00
|
|
|
oi, retaddr);
|
2003-08-09 01:58:05 +02:00
|
|
|
}
|
2013-07-26 20:29:15 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
haddr = addr + env->tlb_table[mmu_idx][index].addend;
|
2013-09-04 20:45:20 +02:00
|
|
|
glue(glue(st, SUFFIX), _be_p)((uint8_t *)haddr, val);
|
2003-08-09 01:58:05 +02:00
|
|
|
}
|
2013-09-04 20:45:20 +02:00
|
|
|
#endif /* DATA_SIZE > 1 */
|
2003-08-09 01:58:05 +02:00
|
|
|
|
2015-06-01 13:13:23 +02:00
|
|
|
#if DATA_SIZE == 1
|
|
|
|
/* Probe for whether the specified guest write access is permitted.
|
|
|
|
* If it is not permitted then an exception will be taken in the same
|
|
|
|
* way as if this were a real write access (and we will not return).
|
|
|
|
* Otherwise the function will return, and there will be a valid
|
|
|
|
* entry in the TLB for this access.
|
|
|
|
*/
|
|
|
|
void probe_write(CPUArchState *env, target_ulong addr, int mmu_idx,
|
|
|
|
uintptr_t retaddr)
|
|
|
|
{
|
|
|
|
int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
|
|
|
|
target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
|
|
|
|
|
|
|
|
if ((addr & TARGET_PAGE_MASK)
|
|
|
|
!= (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
|
|
|
|
/* TLB entry is for a different page */
|
2016-07-06 20:26:52 +02:00
|
|
|
if (!VICTIM_TLB_HIT(addr_write, addr)) {
|
2015-06-01 13:13:23 +02:00
|
|
|
tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2004-10-03 17:07:13 +02:00
|
|
|
#endif /* !defined(SOFTMMU_CODE_ACCESS) */
|
|
|
|
|
|
|
|
/*
 * Drop every macro used to parameterize this template: the ones defined
 * near the top of this file (DATA_SIZE, SUFFIX, LSUFFIX, DATA_TYPE,
 * SDATA_TYPE, ...) as well as SHIFT, on which DATA_SIZE is based.
 * NOTE(review): presumably this lets the file be included again with a
 * different SHIFT without redefinition clashes -- confirm against the
 * files that include it.
 */
#undef READ_ACCESS_TYPE
#undef SHIFT
#undef DATA_TYPE
#undef SUFFIX
#undef LSUFFIX
#undef DATA_SIZE
#undef ADDR_READ
#undef WORD_TYPE
#undef SDATA_TYPE
#undef USUFFIX
#undef SSUFFIX
#undef BSWAP
#undef TGT_BE
#undef TGT_LE
#undef CPU_BE
#undef CPU_LE
#undef helper_le_ld_name
#undef helper_be_ld_name
#undef helper_le_lds_name
#undef helper_be_lds_name
#undef helper_le_st_name
#undef helper_be_st_name
#undef helper_te_ld_name
#undef helper_te_st_name
|