32ead8e62f
The name "iothread" is overloaded. Use the term Big QEMU Lock (BQL) instead, it is already widely used and unambiguous. Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Reviewed-by: Paul Durrant <paul@xen.org> Acked-by: David Woodhouse <dwmw@amazon.co.uk> Reviewed-by: Cédric Le Goater <clg@kaod.org> Acked-by: Ilya Leoshkevich <iii@linux.ibm.com> Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com> Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com> Message-id: 20240102153529.486531-3-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
551 lines
14 KiB
C
551 lines
14 KiB
C
/*
 * QEMU Xen emulation: Grant table support
 *
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
|
|
|
|
#include "qemu/osdep.h"
|
|
#include "qemu/host-utils.h"
|
|
#include "qemu/module.h"
|
|
#include "qemu/lockable.h"
|
|
#include "qemu/main-loop.h"
|
|
#include "qapi/error.h"
|
|
#include "qom/object.h"
|
|
#include "exec/target_page.h"
|
|
#include "exec/address-spaces.h"
|
|
#include "migration/vmstate.h"
|
|
|
|
#include "hw/sysbus.h"
|
|
#include "hw/xen/xen.h"
|
|
#include "hw/xen/xen_backend_ops.h"
|
|
#include "xen_overlay.h"
|
|
#include "xen_gnttab.h"
|
|
#include "xen_primary_console.h"
|
|
|
|
#include "sysemu/kvm.h"
|
|
#include "sysemu/kvm_xen.h"
|
|
|
|
#include "hw/xen/interface/memory.h"
|
|
#include "hw/xen/interface/grant_table.h"
|
|
|
|
#define TYPE_XEN_GNTTAB "xen-gnttab"
|
|
OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
|
|
|
|
#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
|
|
|
|
static struct gnttab_backend_ops emu_gnttab_backend_ops;
|
|
|
|
struct XenGnttabState {
|
|
/*< private >*/
|
|
SysBusDevice busdev;
|
|
/*< public >*/
|
|
|
|
QemuMutex gnt_lock;
|
|
|
|
uint32_t nr_frames;
|
|
uint32_t max_frames;
|
|
|
|
union {
|
|
grant_entry_v1_t *v1;
|
|
/* Theoretically, v2 support could be added here. */
|
|
} entries;
|
|
|
|
MemoryRegion gnt_frames;
|
|
MemoryRegion *gnt_aliases;
|
|
uint64_t *gnt_frame_gpas;
|
|
|
|
uint8_t *map_track;
|
|
};
|
|
|
|
/* The one grant table device; NULL until it has been created/realized. */
struct XenGnttabState *xen_gnttab_singleton;
|
|
|
|
/*
 * Realize the grant table device.
 *
 * Allocates RAM for max_frames grant table pages, creates one page-sized
 * alias per frame for overlay mapping, seeds the reserved XenStore grant
 * and installs the emulated grant table backend operations.
 *
 * Fails through @errp unless QEMU is running in Xen emulation mode.
 */
static void xen_gnttab_realize(DeviceState *dev, Error **errp)
{
    XenGnttabState *s = XEN_GNTTAB(dev);
    grant_entry_v1_t *xenstore_gnt;
    uint32_t frame;

    if (xen_mode != XEN_EMULATE) {
        error_setg(errp, "Xen grant table support is for Xen emulation");
        return;
    }

    s->max_frames = kvm_xen_get_gnttab_max_frames();

    /* Backing RAM for the whole table, viewed as v1 entries */
    memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
                           XEN_PAGE_SIZE * s->max_frames, &error_abort);
    memory_region_set_enabled(&s->gnt_frames, true);
    s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);

    /* Per-frame bookkeeping */
    s->gnt_aliases = g_new0(MemoryRegion, s->max_frames);
    s->gnt_frame_gpas = g_new(uint64_t, s->max_frames);
    s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1);

    /* Create individual page-sized aliases for overlays */
    for (frame = 0; frame < s->max_frames; frame++) {
        memory_region_init_alias(&s->gnt_aliases[frame], OBJECT(dev),
                                 NULL, &s->gnt_frames,
                                 frame * XEN_PAGE_SIZE, XEN_PAGE_SIZE);
        s->gnt_frame_gpas[frame] = INVALID_GPA;
    }

    /* Start from an empty table holding only the reserved XenStore grant */
    s->nr_frames = 0;
    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
    xenstore_gnt = &s->entries.v1[GNTTAB_RESERVED_XENSTORE];
    xenstore_gnt->flags = GTF_permit_access;
    xenstore_gnt->frame = XEN_SPECIAL_PFN(XENSTORE);

    qemu_mutex_init(&s->gnt_lock);

    /* Publish the device and its backend operations */
    xen_gnttab_singleton = s;
    xen_gnttab_ops = &emu_gnttab_backend_ops;
}
|
|
|
|
static int xen_gnttab_post_load(void *opaque, int version_id)
|
|
{
|
|
XenGnttabState *s = XEN_GNTTAB(opaque);
|
|
uint32_t i;
|
|
|
|
for (i = 0; i < s->nr_frames; i++) {
|
|
if (s->gnt_frame_gpas[i] != INVALID_GPA) {
|
|
xen_overlay_do_map_page(&s->gnt_aliases[i], s->gnt_frame_gpas[i]);
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static bool xen_gnttab_is_needed(void *opaque)
|
|
{
|
|
return xen_mode == XEN_EMULATE;
|
|
}
|
|
|
|
static const VMStateDescription xen_gnttab_vmstate = {
|
|
.name = "xen_gnttab",
|
|
.version_id = 1,
|
|
.minimum_version_id = 1,
|
|
.needed = xen_gnttab_is_needed,
|
|
.post_load = xen_gnttab_post_load,
|
|
.fields = (const VMStateField[]) {
|
|
VMSTATE_UINT32(nr_frames, XenGnttabState),
|
|
VMSTATE_VARRAY_UINT32(gnt_frame_gpas, XenGnttabState, nr_frames, 0,
|
|
vmstate_info_uint64, uint64_t),
|
|
VMSTATE_END_OF_LIST()
|
|
}
|
|
};
|
|
|
|
/* Class initialiser: hook up the realize callback and migration state. */
static void xen_gnttab_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *device_class = DEVICE_CLASS(klass);

    device_class->vmsd = &xen_gnttab_vmstate;
    device_class->realize = xen_gnttab_realize;
}
|
|
|
|
static const TypeInfo xen_gnttab_info = {
|
|
.name = TYPE_XEN_GNTTAB,
|
|
.parent = TYPE_SYS_BUS_DEVICE,
|
|
.instance_size = sizeof(XenGnttabState),
|
|
.class_init = xen_gnttab_class_init,
|
|
};
|
|
|
|
void xen_gnttab_create(void)
|
|
{
|
|
xen_gnttab_singleton = XEN_GNTTAB(sysbus_create_simple(TYPE_XEN_GNTTAB,
|
|
-1, NULL));
|
|
}
|
|
|
|
static void xen_gnttab_register_types(void)
|
|
{
|
|
type_register_static(&xen_gnttab_info);
|
|
}
|
|
|
|
type_init(xen_gnttab_register_types)
|
|
|
|
int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
|
|
{
|
|
XenGnttabState *s = xen_gnttab_singleton;
|
|
uint64_t gpa = gfn << XEN_PAGE_SHIFT;
|
|
|
|
if (!s) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
if (idx >= s->max_frames) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
BQL_LOCK_GUARD();
|
|
QEMU_LOCK_GUARD(&s->gnt_lock);
|
|
|
|
xen_overlay_do_map_page(&s->gnt_aliases[idx], gpa);
|
|
|
|
s->gnt_frame_gpas[idx] = gpa;
|
|
|
|
if (s->nr_frames <= idx) {
|
|
s->nr_frames = idx + 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int xen_gnttab_set_version_op(struct gnttab_set_version *set)
|
|
{
|
|
int ret;
|
|
|
|
switch (set->version) {
|
|
case 1:
|
|
ret = 0;
|
|
break;
|
|
|
|
case 2:
|
|
/* Behave as before set_version was introduced. */
|
|
ret = -ENOSYS;
|
|
break;
|
|
|
|
default:
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
set->version = 1;
|
|
return ret;
|
|
}
|
|
|
|
int xen_gnttab_get_version_op(struct gnttab_get_version *get)
|
|
{
|
|
if (get->dom != DOMID_SELF && get->dom != xen_domid) {
|
|
return -ESRCH;
|
|
}
|
|
|
|
get->version = 1;
|
|
return 0;
|
|
}
|
|
|
|
int xen_gnttab_query_size_op(struct gnttab_query_size *size)
|
|
{
|
|
XenGnttabState *s = xen_gnttab_singleton;
|
|
|
|
if (!s) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
if (size->dom != DOMID_SELF && size->dom != xen_domid) {
|
|
size->status = GNTST_bad_domain;
|
|
return 0;
|
|
}
|
|
|
|
size->status = GNTST_okay;
|
|
size->nr_frames = s->nr_frames;
|
|
size->max_nr_frames = s->max_frames;
|
|
return 0;
|
|
}
|
|
|
|
/* Track per-open refs, to allow close() to clean up. */
|
|
struct active_ref {
|
|
MemoryRegionSection mrs;
|
|
void *virtaddr;
|
|
uint32_t refcnt;
|
|
int prot;
|
|
};
|
|
|
|
static void gnt_unref(XenGnttabState *s, grant_ref_t ref,
|
|
MemoryRegionSection *mrs, int prot)
|
|
{
|
|
if (mrs && mrs->mr) {
|
|
if (prot & PROT_WRITE) {
|
|
memory_region_set_dirty(mrs->mr, mrs->offset_within_region,
|
|
XEN_PAGE_SIZE);
|
|
}
|
|
memory_region_unref(mrs->mr);
|
|
mrs->mr = NULL;
|
|
}
|
|
assert(s->map_track[ref] != 0);
|
|
|
|
if (--s->map_track[ref] == 0) {
|
|
grant_entry_v1_t *gnt_p = &s->entries.v1[ref];
|
|
qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing));
|
|
}
|
|
}
|
|
|
|
/*
 * Try to take grant reference @ref for access with protection @prot.
 *
 * The entry must be a plain GTF_permit_access grant to DOMID_QEMU (and
 * must not be read-only when PROT_WRITE is requested).  On success the
 * GTF_reading (and, for writes, GTF_writing) bits are set atomically in
 * the entry and the guest physical address of the granted frame is
 * returned.  Returns INVALID_GPA if @ref is out of range, its tracking
 * counter is saturated, the entry is unsuitable, or the flags kept
 * changing underneath us.
 */
static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
{
    const int want_write = prot & PROT_WRITE;
    uint16_t check_mask = GTF_type_mask | GTF_sub_page;
    uint16_t set_bits = GTF_reading;
    grant_entry_v1_t *entry;
    int attempt;

    if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1) {
        return INVALID_GPA;
    }
    if (s->map_track[ref] == UINT8_MAX) {
        return INVALID_GPA;
    }

    if (want_write) {
        /* A writable mapping must not be granted read-only */
        check_mask |= GTF_readonly;
        set_bits |= GTF_writing;
    }

    entry = &s->entries.v1[ref];

    /*
     * The guest can legitimately be changing the GTF_readonly flag. Allow
     * that, but don't let a malicious guest cause a livelock.
     */
    for (attempt = 0; attempt < 5; attempt++) {
        /* Read the entry before an atomic operation on its flags */
        grant_entry_v1_t snap = *(volatile grant_entry_v1_t *)entry;
        uint16_t wanted;

        if ((snap.flags & check_mask) != GTF_permit_access ||
            snap.domid != DOMID_QEMU) {
            return INVALID_GPA;
        }

        wanted = snap.flags | set_bits;

        if (qatomic_cmpxchg(&entry->flags, snap.flags, wanted) != snap.flags) {
            /* Flags changed since the snapshot; re-validate and retry */
            continue;
        }

        return (uint64_t)snap.frame << XEN_PAGE_SHIFT;
    }

    return INVALID_GPA;
}
|
|
|
|
struct xengntdev_handle {
|
|
GHashTable *active_maps;
|
|
};
|
|
|
|
/*
 * Backend op: set the maximum number of grants for a handle.
 *
 * Nothing needs doing in the emulated implementation; both arguments
 * are ignored and 0 is always returned.
 */
static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt,
                                        uint32_t nr_grants)
{
    const int success = 0;

    return success;
}
|
|
|
|
static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt,
|
|
uint32_t count, uint32_t domid,
|
|
uint32_t *refs, int prot)
|
|
{
|
|
XenGnttabState *s = xen_gnttab_singleton;
|
|
struct active_ref *act;
|
|
|
|
if (!s) {
|
|
errno = ENOTSUP;
|
|
return NULL;
|
|
}
|
|
|
|
if (domid != xen_domid) {
|
|
errno = EINVAL;
|
|
return NULL;
|
|
}
|
|
|
|
if (!count || count > 4096) {
|
|
errno = EINVAL;
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Making a contiguous mapping from potentially discontiguous grant
|
|
* references would be... distinctly non-trivial. We don't support it.
|
|
* Even changing the API to return an array of pointers, one per page,
|
|
* wouldn't be simple to use in PV backends because some structures
|
|
* actually cross page boundaries (e.g. 32-bit blkif_response ring
|
|
* entries are 12 bytes).
|
|
*/
|
|
if (count != 1) {
|
|
errno = EINVAL;
|
|
return NULL;
|
|
}
|
|
|
|
QEMU_LOCK_GUARD(&s->gnt_lock);
|
|
|
|
act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0]));
|
|
if (act) {
|
|
if ((prot & PROT_WRITE) && !(act->prot & PROT_WRITE)) {
|
|
if (gnt_ref(s, refs[0], prot) == INVALID_GPA) {
|
|
return NULL;
|
|
}
|
|
act->prot |= PROT_WRITE;
|
|
}
|
|
act->refcnt++;
|
|
} else {
|
|
uint64_t gpa = gnt_ref(s, refs[0], prot);
|
|
if (gpa == INVALID_GPA) {
|
|
errno = EINVAL;
|
|
return NULL;
|
|
}
|
|
|
|
act = g_new0(struct active_ref, 1);
|
|
act->prot = prot;
|
|
act->refcnt = 1;
|
|
act->mrs = memory_region_find(get_system_memory(), gpa, XEN_PAGE_SIZE);
|
|
|
|
if (act->mrs.mr &&
|
|
!int128_lt(act->mrs.size, int128_make64(XEN_PAGE_SIZE)) &&
|
|
memory_region_get_ram_addr(act->mrs.mr) != RAM_ADDR_INVALID) {
|
|
act->virtaddr = qemu_map_ram_ptr(act->mrs.mr->ram_block,
|
|
act->mrs.offset_within_region);
|
|
}
|
|
if (!act->virtaddr) {
|
|
gnt_unref(s, refs[0], &act->mrs, 0);
|
|
g_free(act);
|
|
errno = EINVAL;
|
|
return NULL;
|
|
}
|
|
|
|
s->map_track[refs[0]]++;
|
|
g_hash_table_insert(xgt->active_maps, GINT_TO_POINTER(refs[0]), act);
|
|
}
|
|
|
|
return act->virtaddr;
|
|
}
|
|
|
|
/*
 * GHRFunc-style helper: drop the grant reference held by one active_maps
 * entry and free the entry.  @key is the grant reference, @value the
 * struct active_ref and @user_data the XenGnttabState.  Always returns
 * true so that g_hash_table_foreach_remove() removes the entry.
 */
static gboolean do_unmap(gpointer key, gpointer value, gpointer user_data)
{
    struct active_ref *mapping = value;
    XenGnttabState *gnttab = user_data;
    grant_ref_t ref = GPOINTER_TO_INT(key);

    gnt_unref(gnttab, ref, &mapping->mrs, mapping->prot);
    g_free(mapping);

    return true;
}
|
|
|
|
static int xen_be_gnttab_unmap(struct xengntdev_handle *xgt,
|
|
void *start_address, uint32_t *refs,
|
|
uint32_t count)
|
|
{
|
|
XenGnttabState *s = xen_gnttab_singleton;
|
|
struct active_ref *act;
|
|
|
|
if (!s) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
if (count != 1) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
QEMU_LOCK_GUARD(&s->gnt_lock);
|
|
|
|
act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0]));
|
|
if (!act) {
|
|
return -ENOENT;
|
|
}
|
|
|
|
if (act->virtaddr != start_address) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (!--act->refcnt) {
|
|
do_unmap(GINT_TO_POINTER(refs[0]), act, s);
|
|
g_hash_table_remove(xgt->active_maps, GINT_TO_POINTER(refs[0]));
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* This looks a bit like the one for true Xen in xen-operations.c but
|
|
* in emulation we don't support multi-page mappings. And under Xen we
|
|
* *want* the multi-page mappings so we have fewer bounces through the
|
|
* kernel and the hypervisor. So the code paths end up being similar,
|
|
* but different.
|
|
*/
|
|
static int xen_be_gnttab_copy(struct xengntdev_handle *xgt, bool to_domain,
|
|
uint32_t domid, XenGrantCopySegment *segs,
|
|
uint32_t nr_segs, Error **errp)
|
|
{
|
|
int prot = to_domain ? PROT_WRITE : PROT_READ;
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < nr_segs; i++) {
|
|
XenGrantCopySegment *seg = &segs[i];
|
|
void *page;
|
|
uint32_t ref = to_domain ? seg->dest.foreign.ref :
|
|
seg->source.foreign.ref;
|
|
|
|
page = xen_be_gnttab_map_refs(xgt, 1, domid, &ref, prot);
|
|
if (!page) {
|
|
if (errp) {
|
|
error_setg_errno(errp, errno,
|
|
"xen_be_gnttab_map_refs failed");
|
|
}
|
|
return -errno;
|
|
}
|
|
|
|
if (to_domain) {
|
|
memcpy(page + seg->dest.foreign.offset, seg->source.virt,
|
|
seg->len);
|
|
} else {
|
|
memcpy(seg->dest.virt, page + seg->source.foreign.offset,
|
|
seg->len);
|
|
}
|
|
|
|
if (xen_be_gnttab_unmap(xgt, page, &ref, 1)) {
|
|
if (errp) {
|
|
error_setg_errno(errp, errno, "xen_be_gnttab_unmap failed");
|
|
}
|
|
return -errno;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static struct xengntdev_handle *xen_be_gnttab_open(void)
|
|
{
|
|
struct xengntdev_handle *xgt = g_new0(struct xengntdev_handle, 1);
|
|
|
|
xgt->active_maps = g_hash_table_new(g_direct_hash, g_direct_equal);
|
|
return xgt;
|
|
}
|
|
|
|
static int xen_be_gnttab_close(struct xengntdev_handle *xgt)
|
|
{
|
|
XenGnttabState *s = xen_gnttab_singleton;
|
|
|
|
if (!s) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
g_hash_table_foreach_remove(xgt->active_maps, do_unmap, s);
|
|
g_hash_table_destroy(xgt->active_maps);
|
|
g_free(xgt);
|
|
return 0;
|
|
}
|
|
|
|
static struct gnttab_backend_ops emu_gnttab_backend_ops = {
|
|
.open = xen_be_gnttab_open,
|
|
.close = xen_be_gnttab_close,
|
|
.grant_copy = xen_be_gnttab_copy,
|
|
.set_max_grants = xen_be_gnttab_set_max_grants,
|
|
.map_refs = xen_be_gnttab_map_refs,
|
|
.unmap = xen_be_gnttab_unmap,
|
|
};
|
|
|
|
int xen_gnttab_reset(void)
|
|
{
|
|
XenGnttabState *s = xen_gnttab_singleton;
|
|
|
|
if (!s) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
QEMU_LOCK_GUARD(&s->gnt_lock);
|
|
|
|
s->nr_frames = 0;
|
|
|
|
memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
|
|
s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
|
|
s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
|
|
|
|
if (xen_primary_console_get_pfn()) {
|
|
s->entries.v1[GNTTAB_RESERVED_CONSOLE].flags = GTF_permit_access;
|
|
s->entries.v1[GNTTAB_RESERVED_CONSOLE].frame = XEN_SPECIAL_PFN(CONSOLE);
|
|
}
|
|
|
|
return 0;
|
|
}
|