Merge tag 'pull-vfio-20231219' of https://github.com/legoater/qemu into staging

vfio queue:

* Introduce an IOMMU interface backend for VFIO devices
* Convert IOMMU type1 and sPAPR IOMMU to respective backends
* Introduce a new IOMMUFD backend for ARM, x86_64 and s390x platforms

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmWB34AACgkQUaNDx8/7
# 7KGOMxAAqXegvAneHqIlu4c8TzTuUR2rkYgev9RdfIHRDuY2XtaX14xlWn/rpTXZ
# qSgeta+iT8Cv4YV1POJeHWFDNs9E29p1w+R7nLcH1qTIIaZHtxwbVVQ3s7kAo1Vb
# 1S1G0/zIznzGVI50a0lj1gO2yQJnu/79nXpnICgA5REW0CscMssnvboQODlwq17V
# ZLNVM8CSAvKl6ppkmzRdfNXCfq6x7bf4MsvnuXsqda4TBbvyyTjAqdo/8sjKiGly
# gSDQqhgy6cvEXIF0UUHPJzFApf0YdXUDlL8hzH90hvRVu4W/t24dPmT7UkVIX9Ek
# TA7RVxv7iJlHtFDqfSTAJFr7nKO9Tm2V9N7xbD1OJUKrMoPZRT6+0R1hMKqsZ5z+
# nG6khqHGzuo/aI9n70YxYIPXt+vs/EHI4WUtslGLUTL0xv8lUzk6cxyIJupFRmDS
# ix6GM9TXOV8RyOveL2knHVymlFnAR6dekkMB+6ljUTuzDwG0oco4vno8z9bi7Vct
# j36bM56U3lhY+w+Ljoy0gPwgrw/FROnGG3mp1mwp1KRHqtEDnUQu8CaLbJOBsBGE
# JJDP6AKAYMczdmYVkd4CvE0WaeSxtOUxW5H5NCPjtaFQt0qEcght2lA2K15g521q
# jeojoJ/QK5949jnNCqm1Z66/YQVL79lPyL0E+mxEohwu+yTORk4=
# =U0x5
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 19 Dec 2023 13:22:56 EST
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [unknown]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20231219' of https://github.com/legoater/qemu: (47 commits)
  hw/ppc/Kconfig: Imply VFIO_PCI
  docs/devel: Add VFIO iommufd backend documentation
  vfio: Introduce a helper function to initialize VFIODevice
  vfio/ccw: Move VFIODevice initializations in vfio_ccw_instance_init
  vfio/ap: Move VFIODevice initializations in vfio_ap_instance_init
  vfio/platform: Move VFIODevice initializations in vfio_platform_instance_init
  vfio/pci: Move VFIODevice initializations in vfio_instance_init
  hw/i386: Activate IOMMUFD for q35 machines
  kconfig: Activate IOMMUFD for s390x machines
  hw/arm: Activate IOMMUFD for virt machines
  vfio: Make VFIOContainerBase pointer parameter const in VFIOIOMMUOps callbacks
  vfio/ccw: Make vfio cdev pre-openable by passing a file handle
  vfio/ccw: Allow the selection of a given iommu backend
  vfio/ap: Make vfio cdev pre-openable by passing a file handle
  vfio/ap: Allow the selection of a given iommu backend
  vfio/platform: Make vfio cdev pre-openable by passing a file handle
  vfio/platform: Allow the selection of a given iommu backend
  vfio/pci: Make vfio cdev pre-openable by passing a file handle
  vfio/pci: Allow the selection of a given iommu backend
  vfio/iommufd: Enable pci hot reset through iommufd cdev interface
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Stefan Hajnoczi committed 2023-12-20 09:39:18 -05:00
commit dd7d3e3540
33 changed files with 2229 additions and 514 deletions

MAINTAINERS

@ -2167,6 +2167,17 @@ F: hw/vfio/ap.c
F: docs/system/s390x/vfio-ap.rst
L: qemu-s390x@nongnu.org

iommufd
M: Yi Liu <yi.l.liu@intel.com>
M: Eric Auger <eric.auger@redhat.com>
M: Zhenzhong Duan <zhenzhong.duan@intel.com>
S: Supported
F: backends/iommufd.c
F: include/sysemu/iommufd.h
F: include/qemu/chardev_open.h
F: util/chardev_open.c
F: docs/devel/vfio-iommufd.rst

vhost
M: Michael S. Tsirkin <mst@redhat.com>
S: Supported

backends/Kconfig

@ -1 +1,5 @@
source tpm/Kconfig

config IOMMUFD
    bool
    depends on VFIO

backends/iommufd.c (new file, 245 lines)

@ -0,0 +1,245 @@
/*
 * iommufd container backend
 *
 * Copyright (C) 2023 Intel Corporation.
 * Copyright Red Hat, Inc. 2023
 *
 * Authors: Yi Liu <yi.l.liu@intel.com>
 *          Eric Auger <eric.auger@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "sysemu/iommufd.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/module.h"
#include "qom/object_interfaces.h"
#include "qemu/error-report.h"
#include "monitor/monitor.h"
#include "trace.h"
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static void iommufd_backend_init(Object *obj)
{
    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);

    be->fd = -1;
    be->users = 0;
    be->owned = true;
    qemu_mutex_init(&be->lock);
}

static void iommufd_backend_finalize(Object *obj)
{
    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);

    if (be->owned) {
        close(be->fd);
        be->fd = -1;
    }
}

static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
{
    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
    int fd = -1;

    fd = monitor_fd_param(monitor_cur(), str, errp);
    if (fd == -1) {
        error_prepend(errp, "Could not parse remote object fd %s:", str);
        return;
    }
    qemu_mutex_lock(&be->lock);
    be->fd = fd;
    be->owned = false;
    qemu_mutex_unlock(&be->lock);
    trace_iommu_backend_set_fd(be->fd);
}

static bool iommufd_backend_can_be_deleted(UserCreatable *uc)
{
    IOMMUFDBackend *be = IOMMUFD_BACKEND(uc);

    return !be->users;
}

static void iommufd_backend_class_init(ObjectClass *oc, void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);

    ucc->can_be_deleted = iommufd_backend_can_be_deleted;

    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
}

int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
{
    int fd, ret = 0;

    qemu_mutex_lock(&be->lock);
    if (be->users == UINT32_MAX) {
        error_setg(errp, "too many connections");
        ret = -E2BIG;
        goto out;
    }
    if (be->owned && !be->users) {
        fd = qemu_open_old("/dev/iommu", O_RDWR);
        if (fd < 0) {
            error_setg_errno(errp, errno, "/dev/iommu opening failed");
            ret = fd;
            goto out;
        }
        be->fd = fd;
    }
    be->users++;
out:
    trace_iommufd_backend_connect(be->fd, be->owned,
                                  be->users, ret);
    qemu_mutex_unlock(&be->lock);
    return ret;
}

void iommufd_backend_disconnect(IOMMUFDBackend *be)
{
    qemu_mutex_lock(&be->lock);
    if (!be->users) {
        goto out;
    }
    be->users--;
    if (!be->users && be->owned) {
        close(be->fd);
        be->fd = -1;
    }
out:
    trace_iommufd_backend_disconnect(be->fd, be->users);
    qemu_mutex_unlock(&be->lock);
}

int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
                               Error **errp)
{
    int ret, fd = be->fd;
    struct iommu_ioas_alloc alloc_data = {
        .size = sizeof(alloc_data),
        .flags = 0,
    };

    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
    if (ret) {
        error_setg_errno(errp, errno, "Failed to allocate ioas");
        return ret;
    }

    *ioas_id = alloc_data.out_ioas_id;
    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);

    return ret;
}

void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
{
    int ret, fd = be->fd;
    struct iommu_destroy des = {
        .size = sizeof(des),
        .id = id,
    };

    ret = ioctl(fd, IOMMU_DESTROY, &des);
    trace_iommufd_backend_free_id(fd, id, ret);
    if (ret) {
        error_report("Failed to free id: %u %m", id);
    }
}

int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
                            ram_addr_t size, void *vaddr, bool readonly)
{
    int ret, fd = be->fd;
    struct iommu_ioas_map map = {
        .size = sizeof(map),
        .flags = IOMMU_IOAS_MAP_READABLE |
                 IOMMU_IOAS_MAP_FIXED_IOVA,
        .ioas_id = ioas_id,
        .__reserved = 0,
        .user_va = (uintptr_t)vaddr,
        .iova = iova,
        .length = size,
    };

    if (!readonly) {
        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
    }

    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
                                  vaddr, readonly, ret);
    if (ret) {
        ret = -errno;

        /* TODO: mapping hardware PCI BAR regions is not supported for now */
        if (errno == EFAULT) {
            warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
        } else {
            error_report("IOMMU_IOAS_MAP failed: %m");
        }
    }
    return ret;
}

int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
                              hwaddr iova, ram_addr_t size)
{
    int ret, fd = be->fd;
    struct iommu_ioas_unmap unmap = {
        .size = sizeof(unmap),
        .ioas_id = ioas_id,
        .iova = iova,
        .length = size,
    };

    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
    /*
     * IOMMUFD treats mappings as objects: unmapping a nonexistent mapping
     * is treated as deleting a nonexistent object and returns ENOENT.
     * This differs from the legacy backend, which allows it. A vIOMMU may
     * trigger a lot of redundant unmappings; to avoid flooding the log,
     * treat them as success for IOMMUFD, just like the legacy backend.
     */
    if (ret && errno == ENOENT) {
        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
        ret = 0;
    } else {
        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
    }

    if (ret) {
        ret = -errno;
        error_report("IOMMU_IOAS_UNMAP failed: %m");
    }
    return ret;
}

static const TypeInfo iommufd_backend_info = {
    .name = TYPE_IOMMUFD_BACKEND,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(IOMMUFDBackend),
    .instance_init = iommufd_backend_init,
    .instance_finalize = iommufd_backend_finalize,
    .class_size = sizeof(IOMMUFDBackendClass),
    .class_init = iommufd_backend_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};

static void register_types(void)
{
    type_register_static(&iommufd_backend_info);
}

type_init(register_types);
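
For orientation, a minimal usage sketch of the API above (hypothetical,
not part of the patch: the helper name is invented, error handling is
trimmed, and in QEMU the IOMMUFDBackend object is normally created with
"-object iommufd" rather than by hand):

    static int iommufd_map_one_region(IOMMUFDBackend *be, void *buf,
                                      hwaddr iova, ram_addr_t size,
                                      Error **errp)
    {
        uint32_t ioas_id;
        int ret;

        /* Opens /dev/iommu on first use; refcounted afterwards. */
        ret = iommufd_backend_connect(be, errp);
        if (ret) {
            return ret;
        }
        ret = iommufd_backend_alloc_ioas(be, &ioas_id, errp);
        if (ret) {
            goto out;
        }
        ret = iommufd_backend_map_dma(be, ioas_id, iova, size, buf, false);
        if (!ret) {
            iommufd_backend_unmap_dma(be, ioas_id, iova, size);
        }
        iommufd_backend_free_id(be, ioas_id);
    out:
        iommufd_backend_disconnect(be);
        return ret;
    }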

backends/meson.build

@ -20,6 +20,7 @@ if have_vhost_user
  system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
endif
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
if have_vhost_user_crypto
  system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
endif

backends/trace-events

@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
dbus_vmstate_post_load(int version_id) "version_id: %d"
dbus_vmstate_loading(const char *id) "id: %s"
dbus_vmstate_saving(const char *id) "id: %s"
# iommufd.c
iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"

docs/devel/index.rst

@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them.
   s390-dasd-ipl
   tracing
   vfio-migration
   vfio-iommufd
   writing-monitor-commands
   virtio-backends

docs/devel/vfio-iommufd.rst (new file, 166 lines)

@ -0,0 +1,166 @@
===============================
IOMMUFD BACKEND usage with VFIO
===============================

(The terms backend, container and BE are used interchangeably below.)

With the introduction of iommufd, the Linux kernel provides a generic
interface for user space drivers to propagate their DMA mappings to the
kernel for assigned devices. While the legacy kernel interface is
group-centric, the new iommufd interface is device-centric, relying on
the device fd and the iommufd.

To support both interfaces in the QEMU VFIO device, a base container is
introduced to abstract the parts common to the VFIO legacy and iommufd
containers, so that the generic VFIO code can use either one.

The base container implements generic functionality such as the memory
listener and address space management, whereas the derived containers
implement the callbacks specific to either the legacy or the iommufd
interface. Each container has its own way to set up a secure context
and its own DMA management interface. The diagram below shows how this
looks with both containers.
::

                 VFIO                       AddressSpace/Memory
    +-------+  +----------+  +-----+  +-----+
    |  pci  |  | platform |  |  ap |  | ccw |
    +---+---+  +----+-----+  +--+--+  +--+--+     +----------------------+
        |           |           |        |        |     AddressSpace     |
        |           |           |        |        +------------+---------+
    +---V-----------V-----------V--------V----+               /
    |           VFIOAddressSpace              | <------------+
    |                  |                      |  MemoryListener
    |          VFIOContainerBase list         |
    +-------+----------------------------+----+
            |                            |
            |                            |
    +-------V------+            +--------V----------+
    |   iommufd    |            |    vfio legacy    |
    |  container   |            |     container     |
    +-------+------+            +--------+----------+
            |                            |
            | /dev/iommu                 | /dev/vfio/vfio
            | /dev/vfio/devices/vfioX    | /dev/vfio/$group_id
 Userspace  |                            |
 ===========+============================+===========================
 Kernel     |  device fd                 |
            +---------------+            | group/container fd
            | (BIND_IOMMUFD |            | (SET_CONTAINER/SET_IOMMU)
            |  ATTACH_IOAS) |            | device fd
            |               |            |
            |       +-------V------------V-----------------+
 iommufd    |       |                vfio                  |
 (map/unmap |       +---------+--------------------+-------+
  ioas_copy)|                 |                    | map/unmap
            |                 |                    |
     +------V-------+   +-----V------+      +------V--------+
     | iommufd core |   |   device   |      |  vfio iommu   |
     +--------------+   +------------+      +---------------+
* Secure Context setup

  - iommufd BE: uses device fd and iommufd to set up the secure context
    (bind_iommufd, attach_ioas)
  - vfio legacy BE: uses group fd and container fd to set up the secure
    context (set_container, set_iommu)

* Device access

  - iommufd BE: the device fd is opened through ``/dev/vfio/devices/vfioX``
  - vfio legacy BE: the device fd is retrieved from the group fd ioctl

* DMA Mapping flow (see the sketch after this list)

  1. VFIOAddressSpace receives MemoryRegion add/del via the MemoryListener
  2. VFIO populates DMA map/unmap via the container BEs

     * iommufd BE: uses iommufd
     * vfio legacy BE: uses container fd
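
The two backends thus realize the same map operation through different
ioctl paths. As a rough sketch (illustrative only, not part of the patch;
both real implementations appear in full further down in this series),
the legacy BE drives the type1 ioctl on the container fd while the
iommufd BE drives ``IOMMU_IOAS_MAP`` on the ``/dev/iommu`` fd:

.. code-block:: c

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>
    #include <linux/iommufd.h>

    /* vfio legacy BE: map through the type1 IOMMU on the container fd. */
    static int legacy_map_sketch(int container_fd, uint64_t iova,
                                 uint64_t size, void *vaddr)
    {
        struct vfio_iommu_type1_dma_map map = {
            .argsz = sizeof(map),
            .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
            .vaddr = (uintptr_t)vaddr,
            .iova = iova,
            .size = size,
        };

        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
    }

    /* iommufd BE: map into an IOAS on the /dev/iommu fd. */
    static int iommufd_map_sketch(int iommufd, uint32_t ioas_id,
                                  uint64_t iova, uint64_t size, void *vaddr)
    {
        struct iommu_ioas_map map = {
            .size = sizeof(map),
            .flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE |
                     IOMMU_IOAS_MAP_FIXED_IOVA,
            .ioas_id = ioas_id,
            .user_va = (uintptr_t)vaddr,
            .iova = iova,
            .length = size,
        };

        return ioctl(iommufd, IOMMU_IOAS_MAP, &map);
    }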
Example configuration
=====================

Step 1: configure the host device
---------------------------------

It is exactly the same as for a VFIO device using the legacy VFIO
container.

Step 2: configure QEMU
----------------------

Interactions with ``/dev/iommu`` are abstracted by a new iommufd object
(compiled in with the ``CONFIG_IOMMUFD`` option).

Any QEMU device (e.g. a VFIO device) wishing to use ``/dev/iommu`` must
be linked with an iommufd object. Such a device gains a new optional
property named ``iommufd`` that allows an iommufd object to be passed
in. Take the ``vfio-pci`` device for example:

.. code-block:: bash

    -object iommufd,id=iommufd0
    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0

Note that ``/dev/iommu`` and the VFIO cdev can be opened externally by a
management layer. In that case the fds are passed in; the ``fd`` property
accepts either a string naming the fd or a number, for example:

.. code-block:: bash

    -object iommufd,id=iommufd0,fd=22
    -device vfio-pci,iommufd=iommufd0,fd=23

If the ``fd`` property is not passed, the fd is opened by QEMU.

If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd
is not used and the user gets the behavior of the legacy VFIO container:

.. code-block:: bash

    -device vfio-pci,host=0000:02:00.0

Supported platforms
===================

x86, ARM and s390x are currently supported.

Caveats
=======

Dirty page sync
---------------

Dirty page sync with the iommufd backend is not supported yet, so live
migration is disabled by default. It can be force-enabled as below,
though with low efficiency:

.. code-block:: bash

    -object iommufd,id=iommufd0
    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on

P2P DMA
-------

PCI p2p DMA is unsupported because IOMMUFD does not yet support mapping
hardware PCI BAR regions. The warning below shows up for an assigned PCI
device; it is not a bug:

.. code-block:: none

    qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
    qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)

FD passing with mdev
--------------------

The ``vfio-pci`` device checks the ``sysfsdev`` property to decide
whether the backend is an mdev. If FD passing is used, there is no way
to know that, and the mdev is treated like a real PCI device. An error
like the one below is raised if the user tries to enable RAM discarding
for an mdev:

.. code-block:: none

    qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices

The ``vfio-ap`` and ``vfio-ccw`` devices do not have the same issue, as
their backend devices are always mdevs and RAM discarding is always
enabled for them.

hw/arm/Kconfig

@ -8,6 +8,7 @@ config ARM_VIRT
    imply TPM_TIS_SYSBUS
    imply TPM_TIS_I2C
    imply NVDIMM
    imply IOMMUFD
    select ARM_GIC
    select ACPI
    select ARM_SMMUV3

hw/i386/Kconfig

@ -95,6 +95,7 @@ config Q35
    imply E1000E_PCI_EXPRESS
    imply VMPORT
    imply VMMOUSE
    imply IOMMUFD
    select PC_PCI
    select PC_ACPI
    select PCI_EXPRESS_Q35

hw/ppc/Kconfig

@ -3,11 +3,11 @@ config PSERIES
    imply PCI_DEVICES
    imply TEST_DEVICES
    imply VIRTIO_VGA
    imply VFIO_PCI if LINUX # needed by spapr_pci_vfio.c
    select NVDIMM
    select DIMM
    select PCI
    select SPAPR_VSCSI
    select VFIO if LINUX # needed by spapr_pci_vfio.c
    select XICS
    select XIVE
    select MSI_NONBROKEN

hw/ppc/spapr_pci_vfio.c

@ -26,10 +26,12 @@
#include "hw/pci/pci_device.h"
#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
#include CONFIG_DEVICES /* CONFIG_VFIO_PCI */
/*
* Interfaces for IBM EEH (Enhanced Error Handling)
*/
#ifdef CONFIG_VFIO_PCI
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
/*
@ -84,27 +86,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
VFIOAddressSpace *space = vfio_get_address_space(as);
VFIOContainer *container = NULL;
VFIOContainerBase *bcontainer = NULL;
if (QLIST_EMPTY(&space->containers)) {
/* No containers to act on */
goto out;
}
container = QLIST_FIRST(&space->containers);
bcontainer = QLIST_FIRST(&space->containers);
if (QLIST_NEXT(container, next)) {
if (QLIST_NEXT(bcontainer, next)) {
/*
* We don't yet have logic to synchronize EEH state across
* multiple containers
*/
container = NULL;
bcontainer = NULL;
goto out;
}
out:
vfio_put_address_space(space);
return container;
return container_of(bcontainer, VFIOContainer, bcontainer);
}
static bool vfio_eeh_as_ok(AddressSpace *as)
@ -314,3 +316,37 @@ int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
return RTAS_OUT_SUCCESS;
}
#else
bool spapr_phb_eeh_available(SpaprPhbState *sphb)
{
return false;
}
void spapr_phb_vfio_reset(DeviceState *qdev)
{
}
int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
unsigned int addr, int option)
{
return RTAS_OUT_NOT_SUPPORTED;
}
int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state)
{
return RTAS_OUT_NOT_SUPPORTED;
}
int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option)
{
return RTAS_OUT_NOT_SUPPORTED;
}
int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
{
return RTAS_OUT_NOT_SUPPORTED;
}
#endif /* CONFIG_VFIO_PCI */

hw/s390x/Kconfig

@ -6,6 +6,7 @@ config S390_CCW_VIRTIO
    imply VFIO_CCW
    imply WDT_DIAG288
    imply PCIE_DEVICES
    imply IOMMUFD
    select PCI_EXPRESS
    select S390_FLIC
    select S390_FLIC_KVM if KVM

hw/vfio/ap.c

@ -11,10 +11,12 @@
*/
#include "qemu/osdep.h"
#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
#include "hw/vfio/vfio-common.h"
#include "sysemu/iommufd.h"
#include "hw/s390x/ap-device.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
@ -158,18 +160,9 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
VFIODevice *vbasedev = &vapdev->vdev;
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
vbasedev->ops = &vfio_ap_ops;
vbasedev->type = VFIO_DEVICE_TYPE_AP;
vbasedev->dev = dev;
/*
* vfio-ap devices operate in a way compatible with discarding of
* memory in RAM blocks, as no pages are pinned in the host.
* This needs to be set before vfio_get_device() for vfio common to
* handle ram_block_discard_disable().
*/
vapdev->vdev.ram_block_discard_allowed = true;
if (vfio_device_get_name(vbasedev, errp) < 0) {
return;
}
ret = vfio_attach_device(vbasedev->name, vbasedev,
&address_space_memory, errp);
@ -204,6 +197,10 @@ static void vfio_ap_unrealize(DeviceState *dev)
static Property vfio_ap_properties[] = {
DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev),
#ifdef CONFIG_IOMMUFD
DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd,
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
DEFINE_PROP_END_OF_LIST(),
};
@ -224,11 +221,36 @@ static const VMStateDescription vfio_ap_vmstate = {
.unmigratable = 1,
};
static void vfio_ap_instance_init(Object *obj)
{
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj);
VFIODevice *vbasedev = &vapdev->vdev;
/*
* vfio-ap devices operate in a way compatible with discarding of
* memory in RAM blocks, as no pages are pinned in the host.
* This needs to be set before vfio_get_device() for vfio common to
* handle ram_block_discard_disable().
*/
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
DEVICE(vapdev), true);
}
#ifdef CONFIG_IOMMUFD
static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp)
{
vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp);
}
#endif
static void vfio_ap_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
device_class_set_props(dc, vfio_ap_properties);
#ifdef CONFIG_IOMMUFD
object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd);
#endif
dc->vmsd = &vfio_ap_vmstate;
dc->desc = "VFIO-based AP device assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
@ -243,6 +265,7 @@ static const TypeInfo vfio_ap_info = {
.name = TYPE_VFIO_AP_DEVICE,
.parent = TYPE_AP_DEVICE,
.instance_size = sizeof(VFIOAPDevice),
.instance_init = vfio_ap_instance_init,
.class_init = vfio_ap_class_init,
};

hw/vfio/ccw.c

@ -15,12 +15,14 @@
*/
#include "qemu/osdep.h"
#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <linux/vfio_ccw.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
#include "hw/vfio/vfio-common.h"
#include "sysemu/iommufd.h"
#include "hw/s390x/s390-ccw.h"
#include "hw/s390x/vfio-ccw.h"
#include "hw/qdev-properties.h"
@ -588,22 +590,9 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
}
}
vbasedev->ops = &vfio_ccw_ops;
vbasedev->type = VFIO_DEVICE_TYPE_CCW;
vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid,
vcdev->cdev.hostid.ssid,
vcdev->cdev.hostid.devid);
vbasedev->dev = dev;
/*
* All vfio-ccw devices are believed to operate in a way compatible with
* discarding of memory in RAM blocks, ie. pages pinned in the host are
* in the current working set of the guest driver and therefore never
* overlap e.g., with pages available to the guest balloon driver. This
* needs to be set before vfio_get_device() for vfio common to handle
* ram_block_discard_disable().
*/
vbasedev->ram_block_discard_allowed = true;
if (vfio_device_get_name(vbasedev, errp) < 0) {
return;
}
ret = vfio_attach_device(cdev->mdevid, vbasedev,
&address_space_memory, errp);
@ -677,6 +666,10 @@ static void vfio_ccw_unrealize(DeviceState *dev)
static Property vfio_ccw_properties[] = {
DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev),
DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false),
#ifdef CONFIG_IOMMUFD
DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd,
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
DEFINE_PROP_END_OF_LIST(),
};
@ -685,12 +678,39 @@ static const VMStateDescription vfio_ccw_vmstate = {
.unmigratable = 1,
};
static void vfio_ccw_instance_init(Object *obj)
{
VFIOCCWDevice *vcdev = VFIO_CCW(obj);
VFIODevice *vbasedev = &vcdev->vdev;
/*
* All vfio-ccw devices are believed to operate in a way compatible with
* discarding of memory in RAM blocks, ie. pages pinned in the host are
* in the current working set of the guest driver and therefore never
* overlap e.g., with pages available to the guest balloon driver. This
* needs to be set before vfio_get_device() for vfio common to handle
* ram_block_discard_disable().
*/
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
DEVICE(vcdev), true);
}
#ifdef CONFIG_IOMMUFD
static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp)
{
vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp);
}
#endif
static void vfio_ccw_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass);
device_class_set_props(dc, vfio_ccw_properties);
#ifdef CONFIG_IOMMUFD
object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd);
#endif
dc->vmsd = &vfio_ccw_vmstate;
dc->desc = "VFIO-based subchannel assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
@ -708,6 +728,7 @@ static const TypeInfo vfio_ccw_info = {
.name = TYPE_VFIO_CCW,
.parent = TYPE_S390_CCW,
.instance_size = sizeof(VFIOCCWDevice),
.instance_init = vfio_ccw_instance_init,
.class_init = vfio_ccw_class_init,
};

hw/vfio/common.c

@ -19,6 +19,7 @@
*/
#include "qemu/osdep.h"
#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
@ -145,7 +146,7 @@ void vfio_unblock_multiple_devices_migration(void)
bool vfio_viommu_preset(VFIODevice *vbasedev)
{
return vbasedev->container->space->as != &address_space_memory;
return vbasedev->bcontainer->space->as != &address_space_memory;
}
static void vfio_set_migration_error(int err)
@ -177,7 +178,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer)
{
VFIODevice *vbasedev;
MigrationState *ms = migrate_get_current();
@ -187,7 +188,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return false;
}
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
VFIOMigration *migration = vbasedev->migration;
if (!migration) {
@ -203,11 +204,11 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return true;
}
bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
{
VFIODevice *vbasedev;
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (!vbasedev->dirty_pages_supported) {
return false;
}
@ -220,7 +221,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
* Check if all VFIO devices are running and migration is active, which is
* essentially equivalent to the migration being in pre-copy phase.
*/
bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
bool
vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer)
{
VFIODevice *vbasedev;
@ -228,7 +230,7 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
return false;
}
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
VFIOMigration *migration = vbasedev->migration;
if (!migration) {
@ -292,7 +294,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
VFIOContainer *container = giommu->container;
VFIOContainerBase *bcontainer = giommu->bcontainer;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
void *vaddr;
int ret;
@ -322,21 +324,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
* of vaddr will always be there, even if the memory object is
* destroyed and its backing memory munmap-ed.
*/
ret = vfio_dma_map(container, iova,
iotlb->addr_mask + 1, vaddr,
read_only);
ret = vfio_container_dma_map(bcontainer, iova,
iotlb->addr_mask + 1, vaddr,
read_only);
if (ret) {
error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
container, iova,
bcontainer, iova,
iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
}
} else {
ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
ret = vfio_container_dma_unmap(bcontainer, iova,
iotlb->addr_mask + 1, iotlb);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova,
bcontainer, iova,
iotlb->addr_mask + 1, ret, strerror(-ret));
vfio_set_migration_error(ret);
}
@ -350,14 +353,15 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
{
VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
listener);
VFIOContainerBase *bcontainer = vrdl->bcontainer;
const hwaddr size = int128_get64(section->size);
const hwaddr iova = section->offset_within_address_space;
int ret;
/* Unmap with a single call. */
ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL);
if (ret) {
error_report("%s: vfio_dma_unmap() failed: %s", __func__,
error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
strerror(-ret));
}
}
@ -367,6 +371,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
{
VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
listener);
VFIOContainerBase *bcontainer = vrdl->bcontainer;
const hwaddr end = section->offset_within_region +
int128_get64(section->size);
hwaddr start, next, iova;
@ -385,8 +390,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
section->offset_within_address_space;
vaddr = memory_region_get_ram_ptr(section->mr) + start;
ret = vfio_dma_map(vrdl->container, iova, next - start,
vaddr, section->readonly);
ret = vfio_container_dma_map(bcontainer, iova, next - start,
vaddr, section->readonly);
if (ret) {
/* Rollback */
vfio_ram_discard_notify_discard(rdl, section);
@ -396,7 +401,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
return 0;
}
static void vfio_register_ram_discard_listener(VFIOContainer *container,
static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
@ -409,7 +414,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
vrdl = g_new0(VFIORamDiscardListener, 1);
vrdl->container = container;
vrdl->bcontainer = bcontainer;
vrdl->mr = section->mr;
vrdl->offset_within_address_space = section->offset_within_address_space;
vrdl->size = int128_get64(section->size);
@ -417,14 +422,14 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
section->mr);
g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
g_assert(container->pgsizes &&
vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
g_assert(bcontainer->pgsizes &&
vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));
ram_discard_listener_init(&vrdl->listener,
vfio_ram_discard_notify_populate,
vfio_ram_discard_notify_discard, true);
ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);
/*
* Sanity-check if we have a theoretically problematic setup where we could
@ -439,7 +444,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
* number of sections in the address space we could have over time,
* also consuming DMA mappings.
*/
if (container->dma_max_mappings) {
if (bcontainer->dma_max_mappings) {
unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
#ifdef CONFIG_KVM
@ -448,7 +453,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
}
#endif
QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
hwaddr start, end;
start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
@ -460,23 +465,23 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
}
if (vrdl_mappings + max_memslots - vrdl_count >
container->dma_max_mappings) {
bcontainer->dma_max_mappings) {
warn_report("%s: possibly running out of DMA mappings. E.g., try"
" increasing the 'block-size' of virtio-mem devies."
" Maximum possible DMA mappings: %d, Maximum possible"
" memslots: %d", __func__, container->dma_max_mappings,
" memslots: %d", __func__, bcontainer->dma_max_mappings,
max_memslots);
}
}
}
static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
VFIORamDiscardListener *vrdl = NULL;
QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
if (vrdl->mr == section->mr &&
vrdl->offset_within_address_space ==
section->offset_within_address_space) {
@ -538,7 +543,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section,
return true;
}
static bool vfio_get_section_iova_range(VFIOContainer *container,
static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
hwaddr *out_iova, hwaddr *out_end,
Int128 *out_llend)
@ -566,7 +571,8 @@ static bool vfio_get_section_iova_range(VFIOContainer *container,
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
@ -577,7 +583,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
return;
}
if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
&llend)) {
if (memory_region_is_ram_device(section->mr)) {
trace_vfio_listener_region_add_no_dma_map(
memory_region_name(section->mr),
@ -588,7 +595,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
return;
}
if (vfio_container_add_section_window(container, section, &err)) {
if (vfio_container_add_section_window(bcontainer, section, &err)) {
goto fail;
}
@ -610,7 +617,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
giommu->iommu_mr = iommu_mr;
giommu->iommu_offset = section->offset_within_address_space -
section->offset_within_region;
giommu->container = container;
giommu->bcontainer = bcontainer;
llend = int128_add(int128_make64(section->offset_within_region),
section->size);
llend = int128_sub(llend, int128_one());
@ -623,16 +630,17 @@ static void vfio_listener_region_add(MemoryListener *listener,
iommu_idx);
ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
container->pgsizes,
bcontainer->pgsizes,
&err);
if (ret) {
g_free(giommu);
goto fail;
}
if (container->iova_ranges) {
if (bcontainer->iova_ranges) {
ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr,
container->iova_ranges, &err);
bcontainer->iova_ranges,
&err);
if (ret) {
g_free(giommu);
goto fail;
@ -645,7 +653,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
g_free(giommu);
goto fail;
}
QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
return;
@ -659,7 +667,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
* about changes.
*/
if (memory_region_has_ram_discard_manager(section->mr)) {
vfio_register_ram_discard_listener(container, section);
vfio_register_ram_discard_listener(bcontainer, section);
return;
}
@ -672,7 +680,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
llsize = int128_sub(llend, int128_make64(iova));
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
trace_vfio_listener_region_add_no_dma_map(
@ -684,12 +692,12 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
}
ret = vfio_dma_map(container, iova, int128_get64(llsize),
vaddr, section->readonly);
ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
vaddr, section->readonly);
if (ret) {
error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
container, iova, int128_get64(llsize), vaddr, ret,
bcontainer, iova, int128_get64(llsize), vaddr, ret,
strerror(-ret));
if (memory_region_is_ram_device(section->mr)) {
/* Allow unexpected mappings not to be fatal for RAM devices */
@ -711,9 +719,9 @@ fail:
* can gracefully fail. Runtime, there's not much we can do other
* than throw a hardware error.
*/
if (!container->initialized) {
if (!container->error) {
error_propagate_prepend(&container->error, err,
if (!bcontainer->initialized) {
if (!bcontainer->error) {
error_propagate_prepend(&bcontainer->error, err,
"Region %s: ",
memory_region_name(section->mr));
} else {
@ -728,7 +736,8 @@ fail:
static void vfio_listener_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
hwaddr iova, end;
Int128 llend, llsize;
int ret;
@ -741,7 +750,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_iommu(section->mr)) {
VFIOGuestIOMMU *giommu;
QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
giommu->n.start == section->offset_within_region) {
memory_region_unregister_iommu_notifier(section->mr,
@ -761,7 +770,8 @@ static void vfio_listener_region_del(MemoryListener *listener,
*/
}
if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
&llend)) {
return;
}
@ -772,10 +782,10 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask;
pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
} else if (memory_region_has_ram_discard_manager(section->mr)) {
vfio_unregister_ram_discard_listener(container, section);
vfio_unregister_ram_discard_listener(bcontainer, section);
/* Unregistering will trigger an unmap. */
try_unmap = false;
}
@ -784,27 +794,29 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (int128_eq(llsize, int128_2_64())) {
/* The unmap ioctl doesn't accept a full 64-bit span. */
llsize = int128_rshift(llsize, 1);
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
ret = vfio_container_dma_unmap(bcontainer, iova,
int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, int128_get64(llsize), ret,
bcontainer, iova, int128_get64(llsize), ret,
strerror(-ret));
}
iova += int128_get64(llsize);
}
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
ret = vfio_container_dma_unmap(bcontainer, iova,
int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, int128_get64(llsize), ret,
bcontainer, iova, int128_get64(llsize), ret,
strerror(-ret));
}
}
memory_region_unref(section->mr);
vfio_container_del_section_window(container, section);
vfio_container_del_section_window(bcontainer, section);
}
typedef struct VFIODirtyRanges {
@ -817,13 +829,13 @@ typedef struct VFIODirtyRanges {
} VFIODirtyRanges;
typedef struct VFIODirtyRangesListener {
VFIOContainer *container;
VFIOContainerBase *bcontainer;
VFIODirtyRanges ranges;
MemoryListener listener;
} VFIODirtyRangesListener;
static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
VFIOContainer *container)
VFIOContainerBase *bcontainer)
{
VFIOPCIDevice *pcidev;
VFIODevice *vbasedev;
@ -831,7 +843,7 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
owner = memory_region_owner(section->mr);
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
continue;
}
@ -854,7 +866,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
hwaddr iova, end, *min, *max;
if (!vfio_listener_valid_section(section, "tracking_update") ||
!vfio_get_section_iova_range(dirty->container, section,
!vfio_get_section_iova_range(dirty->bcontainer, section,
&iova, &end, NULL)) {
return;
}
@ -878,7 +890,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
* The alternative would be an IOVATree but that has a much bigger runtime
* overhead and unnecessary complexity.
*/
if (vfio_section_is_vfio_pci(section, dirty->container) &&
if (vfio_section_is_vfio_pci(section, dirty->bcontainer) &&
iova >= UINT32_MAX) {
min = &range->minpci64;
max = &range->maxpci64;
@ -902,7 +914,7 @@ static const MemoryListener vfio_dirty_tracking_listener = {
.region_add = vfio_dirty_tracking_update,
};
static void vfio_dirty_tracking_init(VFIOContainer *container,
static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
VFIODirtyRanges *ranges)
{
VFIODirtyRangesListener dirty;
@ -912,10 +924,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
dirty.ranges.min64 = UINT64_MAX;
dirty.ranges.minpci64 = UINT64_MAX;
dirty.listener = vfio_dirty_tracking_listener;
dirty.container = container;
dirty.bcontainer = bcontainer;
memory_listener_register(&dirty.listener,
container->space->as);
bcontainer->space->as);
*ranges = dirty.ranges;
@ -927,7 +939,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
memory_listener_unregister(&dirty.listener);
}
static void vfio_devices_dma_logging_stop(VFIOContainer *container)
static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
{
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
sizeof(uint64_t))] = {};
@ -938,7 +950,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container)
feature->flags = VFIO_DEVICE_FEATURE_SET |
VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (!vbasedev->dirty_tracking) {
continue;
}
@ -952,7 +964,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container)
}
static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
VFIODirtyRanges *tracking)
{
struct vfio_device_feature *feature;
@ -1025,21 +1037,21 @@ static void vfio_device_feature_dma_logging_start_destroy(
g_free(feature);
}
static int vfio_devices_dma_logging_start(VFIOContainer *container)
static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer)
{
struct vfio_device_feature *feature;
VFIODirtyRanges ranges;
VFIODevice *vbasedev;
int ret = 0;
vfio_dirty_tracking_init(container, &ranges);
feature = vfio_device_feature_dma_logging_start_create(container,
vfio_dirty_tracking_init(bcontainer, &ranges);
feature = vfio_device_feature_dma_logging_start_create(bcontainer,
&ranges);
if (!feature) {
return -errno;
}
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (vbasedev->dirty_tracking) {
continue;
}
@ -1056,7 +1068,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
out:
if (ret) {
vfio_devices_dma_logging_stop(container);
vfio_devices_dma_logging_stop(bcontainer);
}
vfio_device_feature_dma_logging_start_destroy(feature);
@ -1066,13 +1078,14 @@ out:
static void vfio_listener_log_global_start(MemoryListener *listener)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
int ret;
if (vfio_devices_all_device_dirty_tracking(container)) {
ret = vfio_devices_dma_logging_start(container);
if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
ret = vfio_devices_dma_logging_start(bcontainer);
} else {
ret = vfio_set_dirty_page_tracking(container, true);
ret = vfio_container_set_dirty_page_tracking(bcontainer, true);
}
if (ret) {
@ -1084,13 +1097,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener)
static void vfio_listener_log_global_stop(MemoryListener *listener)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
int ret = 0;
if (vfio_devices_all_device_dirty_tracking(container)) {
vfio_devices_dma_logging_stop(container);
if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
vfio_devices_dma_logging_stop(bcontainer);
} else {
ret = vfio_set_dirty_page_tracking(container, false);
ret = vfio_container_set_dirty_page_tracking(bcontainer, false);
}
if (ret) {
@ -1126,14 +1140,14 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
return 0;
}
int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size)
{
VFIODevice *vbasedev;
int ret;
QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
ret = vfio_device_dma_logging_report(vbasedev, iova, size,
vbmap->bitmap);
if (ret) {
@ -1149,16 +1163,16 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
return 0;
}
int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
uint64_t size, ram_addr_t ram_addr)
{
bool all_device_dirty_tracking =
vfio_devices_all_device_dirty_tracking(container);
vfio_devices_all_device_dirty_tracking(bcontainer);
uint64_t dirty_pages;
VFIOBitmap vbmap;
int ret;
if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
cpu_physical_memory_set_dirty_range(ram_addr, size,
tcg_enabled() ? DIRTY_CLIENTS_ALL :
DIRTY_CLIENTS_NOCODE);
@ -1171,9 +1185,9 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
}
if (all_device_dirty_tracking) {
ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size);
} else {
ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size);
}
if (ret) {
@ -1183,8 +1197,7 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
vbmap.pages);
trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
ram_addr, dirty_pages);
trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
out:
g_free(vbmap.bitmap);
@ -1201,7 +1214,7 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
vfio_giommu_dirty_notifier *gdn = container_of(n,
vfio_giommu_dirty_notifier, n);
VFIOGuestIOMMU *giommu = gdn->giommu;
VFIOContainer *container = giommu->container;
VFIOContainerBase *bcontainer = giommu->bcontainer;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
ram_addr_t translated_addr;
int ret = -EINVAL;
@ -1216,12 +1229,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
rcu_read_lock();
if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
translated_addr);
if (ret) {
error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, iotlb->addr_mask + 1, ret,
bcontainer, iova, iotlb->addr_mask + 1, ret,
strerror(-ret));
}
}
@ -1246,16 +1259,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
* Sync the whole mapped region (spanning multiple individual mappings)
* in one go.
*/
return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr);
}
static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
MemoryRegionSection *section)
static int
vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
VFIORamDiscardListener *vrdl = NULL;
QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
if (vrdl->mr == section->mr &&
vrdl->offset_within_address_space ==
section->offset_within_address_space) {
@ -1276,7 +1290,7 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
&vrdl);
}
static int vfio_sync_dirty_bitmap(VFIOContainer *container,
static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
ram_addr_t ram_addr;
@ -1284,7 +1298,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
if (memory_region_is_iommu(section->mr)) {
VFIOGuestIOMMU *giommu;
QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
giommu->n.start == section->offset_within_region) {
Int128 llend;
@ -1308,13 +1322,13 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
}
return 0;
} else if (memory_region_has_ram_discard_manager(section->mr)) {
return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
}
ram_addr = memory_region_get_ram_addr(section->mr) +
section->offset_within_region;
return vfio_get_dirty_bitmap(container,
return vfio_get_dirty_bitmap(bcontainer,
REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
int128_get64(section->size), ram_addr);
}
@ -1322,15 +1336,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
static void vfio_listener_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
int ret;
if (vfio_listener_skipped_section(section)) {
return;
}
if (vfio_devices_all_dirty_tracking(container)) {
ret = vfio_sync_dirty_bitmap(container, section);
if (vfio_devices_all_dirty_tracking(bcontainer)) {
ret = vfio_sync_dirty_bitmap(bcontainer, section);
if (ret) {
error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
strerror(-ret));
@ -1449,10 +1464,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
void vfio_put_address_space(VFIOAddressSpace *space)
{
if (QLIST_EMPTY(&space->containers)) {
QLIST_REMOVE(space, list);
g_free(space);
if (!QLIST_EMPTY(&space->containers)) {
return;
}
QLIST_REMOVE(space, list);
g_free(space);
if (QLIST_EMPTY(&vfio_address_spaces)) {
qemu_unregister_reset(vfio_reset_handler, NULL);
}
@ -1481,3 +1499,24 @@ retry:
return info;
}
int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
const VFIOIOMMUOps *ops = &vfio_legacy_ops;
#ifdef CONFIG_IOMMUFD
if (vbasedev->iommufd) {
ops = &vfio_iommufd_ops;
}
#endif
return ops->attach_device(name, vbasedev, as, errp);
}
void vfio_detach_device(VFIODevice *vbasedev)
{
if (!vbasedev->bcontainer) {
return;
}
vbasedev->bcontainer->ops->detach_device(vbasedev);
}

hw/vfio/container-base.c (new file, 101 lines)

@ -0,0 +1,101 @@
/*
 * VFIO BASE CONTAINER
 *
 * Copyright (C) 2023 Intel Corporation.
 * Copyright Red Hat, Inc. 2023
 *
 * Authors: Yi Liu <yi.l.liu@intel.com>
 *          Eric Auger <eric.auger@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "hw/vfio/vfio-container-base.h"

int vfio_container_dma_map(VFIOContainerBase *bcontainer,
                           hwaddr iova, ram_addr_t size,
                           void *vaddr, bool readonly)
{
    g_assert(bcontainer->ops->dma_map);
    return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly);
}

int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
                             hwaddr iova, ram_addr_t size,
                             IOMMUTLBEntry *iotlb)
{
    g_assert(bcontainer->ops->dma_unmap);
    return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
}

int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
                                      MemoryRegionSection *section,
                                      Error **errp)
{
    if (!bcontainer->ops->add_window) {
        return 0;
    }

    return bcontainer->ops->add_window(bcontainer, section, errp);
}

void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
                                       MemoryRegionSection *section)
{
    if (!bcontainer->ops->del_window) {
        return;
    }

    return bcontainer->ops->del_window(bcontainer, section);
}

int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
                                           bool start)
{
    if (!bcontainer->dirty_pages_supported) {
        return 0;
    }

    g_assert(bcontainer->ops->set_dirty_page_tracking);
    return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
}

int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
                                      VFIOBitmap *vbmap,
                                      hwaddr iova, hwaddr size)
{
    g_assert(bcontainer->ops->query_dirty_bitmap);
    return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size);
}

void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
                         const VFIOIOMMUOps *ops)
{
    bcontainer->ops = ops;
    bcontainer->space = space;
    bcontainer->error = NULL;
    bcontainer->dirty_pages_supported = false;
    bcontainer->dma_max_mappings = 0;
    bcontainer->iova_ranges = NULL;
    QLIST_INIT(&bcontainer->giommu_list);
    QLIST_INIT(&bcontainer->vrdl_list);
}

void vfio_container_destroy(VFIOContainerBase *bcontainer)
{
    VFIOGuestIOMMU *giommu, *tmp;

    QLIST_REMOVE(bcontainer, next);

    QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) {
        memory_region_unregister_iommu_notifier(
                MEMORY_REGION(giommu->iommu_mr), &giommu->n);
        QLIST_REMOVE(giommu, giommu_next);
        g_free(giommu);
    }

    g_list_free_full(bcontainer->iova_ranges, g_free);
}
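
For context, each derived container publishes its callbacks through a
VFIOIOMMUOps instance that the helpers above dispatch to. A sketch of
what the legacy backend's wiring plausibly looks like, using only the
callback names dereferenced above and the vfio_legacy_* functions that
appear in the container code below (the exact initializer is an
assumption, not quoted from the patch):

    static const VFIOIOMMUOps vfio_legacy_ops_sketch = {
        .dma_map = vfio_legacy_dma_map,
        .dma_unmap = vfio_legacy_dma_unmap,
        .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking,
        .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap,
        /* .add_window / .del_window stay NULL; the sPAPR backend sets them */
    };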

hw/vfio/container.c

@ -33,6 +33,7 @@
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"
#include "pci.h"
VFIOGroupList vfio_group_list =
QLIST_HEAD_INITIALIZER(vfio_group_list);
@ -60,10 +61,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
}
}
static int vfio_dma_unmap_bitmap(VFIOContainer *container,
static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb)
{
const VFIOContainerBase *bcontainer = &container->bcontainer;
struct vfio_iommu_type1_dma_unmap *unmap;
struct vfio_bitmap *bitmap;
VFIOBitmap vbmap;
@ -91,7 +93,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
bitmap->size = vbmap.size;
bitmap->data = (__u64 *)vbmap.bitmap;
if (vbmap.size > container->max_dirty_bitmap_size) {
if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
ret = -E2BIG;
goto unmap_exit;
@ -115,9 +117,12 @@ unmap_exit:
/*
* DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
*/
int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
ram_addr_t size, IOMMUTLBEntry *iotlb)
static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
struct vfio_iommu_type1_dma_unmap unmap = {
.argsz = sizeof(unmap),
.flags = 0,
@ -127,9 +132,9 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
bool need_dirty_sync = false;
int ret;
if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
if (!vfio_devices_all_device_dirty_tracking(container) &&
container->dirty_pages_supported) {
if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) {
if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
bcontainer->dirty_pages_supported) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
@ -151,8 +156,8 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
*/
if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
container->iommu_type == VFIO_TYPE1v2_IOMMU) {
trace_vfio_dma_unmap_overflow_workaround();
unmap.size -= 1ULL << ctz64(container->pgsizes);
trace_vfio_legacy_dma_unmap_overflow_workaround();
unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
continue;
}
error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
@ -160,7 +165,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
}
if (need_dirty_sync) {
ret = vfio_get_dirty_bitmap(container, iova, size,
ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
iotlb->translated_addr);
if (ret) {
return ret;
@ -170,9 +175,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
return 0;
}
int vfio_dma_map(VFIOContainer *container, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
struct vfio_iommu_type1_dma_map map = {
.argsz = sizeof(map),
.flags = VFIO_DMA_MAP_FLAG_READ,
@ -191,7 +198,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova,
* the VGA ROM space.
*/
if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
(errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
(errno == EBUSY &&
vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
return 0;
}
@ -200,17 +208,17 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova,
return -errno;
}
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
static int
vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
bool start)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
int ret;
struct vfio_iommu_type1_dirty_bitmap dirty = {
.argsz = sizeof(dirty),
};
if (!container->dirty_pages_supported) {
return 0;
}
if (start) {
dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
} else {
@ -227,9 +235,12 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
return ret;
}
int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size)
static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap,
hwaddr iova, hwaddr size)
{
const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
struct vfio_iommu_type1_dirty_bitmap *dbitmap;
struct vfio_iommu_type1_dirty_bitmap_get *range;
int ret;
@ -296,7 +307,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
}
static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
VFIOContainer *container)
VFIOContainerBase *bcontainer)
{
struct vfio_info_cap_header *hdr;
struct vfio_iommu_type1_info_cap_iova_range *cap;
@ -314,8 +325,8 @@ static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
range_set_bounds(range, cap->iova_ranges[i].start,
cap->iova_ranges[i].end);
container->iova_ranges =
range_list_insert(container->iova_ranges, range);
bcontainer->iova_ranges =
range_list_insert(bcontainer->iova_ranges, range);
}
return true;
@ -442,6 +453,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
{
struct vfio_info_cap_header *hdr;
struct vfio_iommu_type1_info_cap_migration *cap_mig;
VFIOContainerBase *bcontainer = &container->bcontainer;
hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
if (!hdr) {
@ -456,22 +468,17 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
* qemu_real_host_page_size to mark those dirty.
*/
if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
container->dirty_pages_supported = true;
container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
container->dirty_pgsizes = cap_mig->pgsize_bitmap;
bcontainer->dirty_pages_supported = true;
bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
}
}
static void vfio_free_container(VFIOContainer *container)
{
g_list_free_full(container->iova_ranges, g_free);
g_free(container);
}
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
VFIOContainer *container;
VFIOContainerBase *bcontainer;
int ret, fd;
VFIOAddressSpace *space;
@ -508,7 +515,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
* details once we know which type of IOMMU we are using.
*/
QLIST_FOREACH(container, &space->containers, next) {
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
ret = vfio_ram_block_discard_disable(container, true);
if (ret) {
@ -544,14 +552,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
}
container = g_malloc0(sizeof(*container));
container->space = space;
container->fd = fd;
container->error = NULL;
container->dirty_pages_supported = false;
container->dma_max_mappings = 0;
container->iova_ranges = NULL;
QLIST_INIT(&container->giommu_list);
QLIST_INIT(&container->vrdl_list);
bcontainer = &container->bcontainer;
vfio_container_init(bcontainer, space, &vfio_legacy_ops);
ret = vfio_init_container(container, group->fd, errp);
if (ret) {
@ -577,16 +580,16 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
}
if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
container->pgsizes = info->iova_pgsizes;
bcontainer->pgsizes = info->iova_pgsizes;
} else {
container->pgsizes = qemu_real_host_page_size();
bcontainer->pgsizes = qemu_real_host_page_size();
}
if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
container->dma_max_mappings = 65535;
if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
bcontainer->dma_max_mappings = 65535;
}
vfio_get_info_iova_range(info, container);
vfio_get_info_iova_range(info, bcontainer);
vfio_get_iommu_info_migration(container, info);
g_free(info);
@ -606,30 +609,29 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
vfio_kvm_device_add_group(group);
QLIST_INIT(&container->group_list);
QLIST_INSERT_HEAD(&space->containers, container, next);
QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
container->listener = vfio_memory_listener;
bcontainer->listener = vfio_memory_listener;
memory_listener_register(&bcontainer->listener, bcontainer->space->as);
memory_listener_register(&container->listener, container->space->as);
if (container->error) {
if (bcontainer->error) {
ret = -1;
error_propagate_prepend(errp, container->error,
error_propagate_prepend(errp, bcontainer->error,
"memory listener initialization failed: ");
goto listener_release_exit;
}
container->initialized = true;
bcontainer->initialized = true;
return 0;
listener_release_exit:
QLIST_REMOVE(group, container_next);
QLIST_REMOVE(container, next);
QLIST_REMOVE(bcontainer, next);
vfio_kvm_device_del_group(group);
memory_listener_unregister(&container->listener);
memory_listener_unregister(&bcontainer->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
vfio_spapr_container_deinit(container);
@ -639,7 +641,7 @@ enable_discards_exit:
vfio_ram_block_discard_disable(container, false);
free_container_exit:
vfio_free_container(container);
g_free(container);
close_fd_exit:
close(fd);
@ -653,6 +655,7 @@ put_space_exit:
static void vfio_disconnect_container(VFIOGroup *group)
{
VFIOContainer *container = group->container;
VFIOContainerBase *bcontainer = &container->bcontainer;
QLIST_REMOVE(group, container_next);
group->container = NULL;
@ -663,7 +666,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
* group.
*/
if (QLIST_EMPTY(&container->group_list)) {
memory_listener_unregister(&container->listener);
memory_listener_unregister(&bcontainer->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
vfio_spapr_container_deinit(container);
@ -676,21 +679,13 @@ static void vfio_disconnect_container(VFIOGroup *group)
}
if (QLIST_EMPTY(&container->group_list)) {
VFIOAddressSpace *space = container->space;
VFIOGuestIOMMU *giommu, *tmp;
VFIOAddressSpace *space = bcontainer->space;
QLIST_REMOVE(container, next);
QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
memory_region_unregister_iommu_notifier(
MEMORY_REGION(giommu->iommu_mr), &giommu->n);
QLIST_REMOVE(giommu, giommu_next);
g_free(giommu);
}
vfio_container_destroy(bcontainer);
trace_vfio_disconnect_container(container->fd);
close(container->fd);
vfio_free_container(container);
g_free(container);
vfio_put_address_space(space);
}
@ -705,7 +700,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
QLIST_FOREACH(group, &vfio_group_list, next) {
if (group->groupid == groupid) {
/* Found it. Now is it already in the right context? */
if (group->container->space->as == as) {
if (group->container->bcontainer.space->as == as) {
return group;
} else {
error_setg(errp, "group %d used in multiple address spaces",
@ -877,13 +872,13 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
* @name and @vbasedev->name are likely to be different depending
* on the type of the device, hence the need for passing @name
*/
int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
int groupid = vfio_device_groupid(vbasedev, errp);
VFIODevice *vbasedev_iter;
VFIOGroup *group;
VFIOContainer *container;
VFIOContainerBase *bcontainer;
int ret;
if (groupid < 0) {
@ -910,26 +905,200 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
return ret;
}
container = group->container;
vbasedev->container = container;
QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next);
bcontainer = &group->container->bcontainer;
vbasedev->bcontainer = bcontainer;
QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
return ret;
}
void vfio_detach_device(VFIODevice *vbasedev)
static void vfio_legacy_detach_device(VFIODevice *vbasedev)
{
VFIOGroup *group = vbasedev->group;
if (!vbasedev->container) {
return;
}
QLIST_REMOVE(vbasedev, global_next);
QLIST_REMOVE(vbasedev, container_next);
vbasedev->container = NULL;
vbasedev->bcontainer = NULL;
trace_vfio_detach_device(vbasedev->name, group->groupid);
vfio_put_base_device(vbasedev);
vfio_put_group(group);
}
static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
{
VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
VFIOGroup *group;
struct vfio_pci_hot_reset_info *info = NULL;
struct vfio_pci_dependent_device *devices;
struct vfio_pci_hot_reset *reset;
int32_t *fds;
int ret, i, count;
bool multi = false;
trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
if (!single) {
vfio_pci_pre_reset(vdev);
}
vdev->vbasedev.needs_reset = false;
ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
if (ret) {
goto out_single;
}
devices = &info->devices[0];
trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
/* Verify that we have all the groups required */
for (i = 0; i < info->count; i++) {
PCIHostDeviceAddress host;
VFIOPCIDevice *tmp;
VFIODevice *vbasedev_iter;
host.domain = devices[i].segment;
host.bus = devices[i].bus;
host.slot = PCI_SLOT(devices[i].devfn);
host.function = PCI_FUNC(devices[i].devfn);
trace_vfio_pci_hot_reset_dep_devices(host.domain,
host.bus, host.slot, host.function, devices[i].group_id);
if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
continue;
}
QLIST_FOREACH(group, &vfio_group_list, next) {
if (group->groupid == devices[i].group_id) {
break;
}
}
if (!group) {
if (!vdev->has_pm_reset) {
error_report("vfio: Cannot reset device %s, "
"depends on group %d which is not owned.",
vdev->vbasedev.name, devices[i].group_id);
}
ret = -EPERM;
goto out;
}
/* Prep dependent devices for reset and clear our marker. */
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (!vbasedev_iter->dev->realized ||
vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
continue;
}
tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
if (single) {
ret = -EINVAL;
goto out_single;
}
vfio_pci_pre_reset(tmp);
tmp->vbasedev.needs_reset = false;
multi = true;
break;
}
}
}
if (!single && !multi) {
ret = -EINVAL;
goto out_single;
}
/* Determine how many group fds need to be passed */
count = 0;
QLIST_FOREACH(group, &vfio_group_list, next) {
for (i = 0; i < info->count; i++) {
if (group->groupid == devices[i].group_id) {
count++;
break;
}
}
}
reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
fds = &reset->group_fds[0];
/* Fill in group fds */
QLIST_FOREACH(group, &vfio_group_list, next) {
for (i = 0; i < info->count; i++) {
if (group->groupid == devices[i].group_id) {
fds[reset->count++] = group->fd;
break;
}
}
}
/* Bus reset! */
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
g_free(reset);
if (ret) {
ret = -errno;
}
trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
ret ? strerror(errno) : "Success");
out:
/* Re-enable INTx on affected devices */
for (i = 0; i < info->count; i++) {
PCIHostDeviceAddress host;
VFIOPCIDevice *tmp;
VFIODevice *vbasedev_iter;
host.domain = devices[i].segment;
host.bus = devices[i].bus;
host.slot = PCI_SLOT(devices[i].devfn);
host.function = PCI_FUNC(devices[i].devfn);
if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
continue;
}
QLIST_FOREACH(group, &vfio_group_list, next) {
if (group->groupid == devices[i].group_id) {
break;
}
}
if (!group) {
break;
}
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (!vbasedev_iter->dev->realized ||
vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
continue;
}
tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
vfio_pci_post_reset(tmp);
break;
}
}
}
out_single:
if (!single) {
vfio_pci_post_reset(vdev);
}
g_free(info);
return ret;
}
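
For contrast with the iommufd flavour added later in this series: the legacy reset above proves ownership of every affected group by passing one group fd per group, while the cdev path (see iommufd.c below) passes a zero-length array and lets the kernel check ownership by devid. Side by side:

/* legacy: one fd per dependent group */
reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
/* iommufd cdev: zero-length group_fds array, ownership derived from devids */
reset->argsz = sizeof(*reset);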
const VFIOIOMMUOps vfio_legacy_ops = {
.dma_map = vfio_legacy_dma_map,
.dma_unmap = vfio_legacy_dma_unmap,
.attach_device = vfio_legacy_attach_device,
.detach_device = vfio_legacy_detach_device,
.set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking,
.query_dirty_bitmap = vfio_legacy_query_dirty_bitmap,
.pci_hot_reset = vfio_legacy_pci_hot_reset,
};
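
Every callback in this ops table recovers the legacy container from the embedded base with the same idiom, and generic code reaches it only through the vfio_container_*() wrappers. Fragment for illustration:

/* inside a vfio_legacy_* callback: bcontainer is the embedded member */
const VFIOContainer *container =
    container_of(bcontainer, VFIOContainer, bcontainer);

/* at a generic call site (sketch): dispatches through the ops table */
ret = vfio_container_dma_map(bcontainer, iova, size, vaddr, readonly);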

hw/vfio/helpers.c

@ -27,6 +27,7 @@
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "monitor/monitor.h"
/*
* Common VFIO interrupt disable
@ -609,3 +610,56 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
return ret;
}
int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
{
struct stat st;
if (vbasedev->fd < 0) {
if (stat(vbasedev->sysfsdev, &st) < 0) {
error_setg_errno(errp, errno, "no such host device");
error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
return -errno;
}
/* User may specify a name, e.g: VFIO platform device */
if (!vbasedev->name) {
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
}
} else {
if (!vbasedev->iommufd) {
error_setg(errp, "Use FD passing only with iommufd backend");
return -EINVAL;
}
/*
* Give a name with fd so any function printing out vbasedev->name
* will not break.
*/
if (!vbasedev->name) {
vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
}
}
return 0;
}
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
{
int fd = monitor_fd_param(monitor_cur(), str, errp);
if (fd < 0) {
error_prepend(errp, "Could not parse remote object fd %s:", str);
return;
}
vbasedev->fd = fd;
}
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
DeviceState *dev, bool ram_discard)
{
vbasedev->type = type;
vbasedev->ops = ops;
vbasedev->dev = dev;
vbasedev->fd = -1;
vbasedev->ram_block_discard_allowed = ram_discard;
}
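
Front ends are expected to call vfio_device_init() from their QOM instance_init hooks; the PCI and platform hunks below do exactly that. A sketch for a hypothetical device type (MY_VFIO_DEVICE() and my_device_ops are made up for illustration):

static void my_vfio_instance_init(Object *obj)
{
    VFIODevice *vbasedev = &MY_VFIO_DEVICE(obj)->vbasedev;

    /* type, device ops, owning DeviceState, ram_block_discard_allowed */
    vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &my_device_ops,
                     DEVICE(obj), false);
}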

hw/vfio/iommufd.c (new file, 630 lines)

@ -0,0 +1,630 @@
/*
* iommufd container backend
*
* Copyright (C) 2023 Intel Corporation.
* Copyright Red Hat, Inc. 2023
*
* Authors: Yi Liu <yi.l.liu@intel.com>
* Eric Auger <eric.auger@redhat.com>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include <linux/iommufd.h>
#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "qapi/error.h"
#include "sysemu/iommufd.h"
#include "hw/qdev-core.h"
#include "sysemu/reset.h"
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
#include "pci.h"
static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
{
const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
return iommufd_backend_map_dma(container->be,
container->ioas_id,
iova, size, vaddr, readonly);
}
static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb)
{
const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
/* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
return iommufd_backend_unmap_dma(container->be,
container->ioas_id, iova, size);
}
static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp)
{
return vfio_kvm_device_add_fd(vbasedev->fd, errp);
}
static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev)
{
Error *err = NULL;
if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) {
error_report_err(err);
}
}
static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
{
IOMMUFDBackend *iommufd = vbasedev->iommufd;
struct vfio_device_bind_iommufd bind = {
.argsz = sizeof(bind),
.flags = 0,
};
int ret;
ret = iommufd_backend_connect(iommufd, errp);
if (ret) {
return ret;
}
/*
* Add the device to kvm-vfio so that it is ready for tracking in
* KVM. Some emulated devices in particular require the KVM
* information to be available when the device is opened.
*/
ret = iommufd_cdev_kvm_device_add(vbasedev, errp);
if (ret) {
goto err_kvm_device_add;
}
/* Bind device to iommufd */
bind.iommufd = iommufd->fd;
ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
if (ret) {
error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d",
vbasedev->fd, bind.iommufd);
goto err_bind;
}
vbasedev->devid = bind.out_devid;
trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
vbasedev->fd, vbasedev->devid);
return ret;
err_bind:
iommufd_cdev_kvm_device_del(vbasedev);
err_kvm_device_add:
iommufd_backend_disconnect(iommufd);
return ret;
}
static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev)
{
/* Unbinding happens automatically when the device fd is closed */
iommufd_cdev_kvm_device_del(vbasedev);
iommufd_backend_disconnect(vbasedev->iommufd);
}
static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp)
{
long int ret = -ENOTTY;
char *path, *vfio_dev_path = NULL, *vfio_path = NULL;
DIR *dir = NULL;
struct dirent *dent;
gchar *contents;
struct stat st;
gsize length;
int major, minor;
dev_t vfio_devt;
path = g_strdup_printf("%s/vfio-dev", sysfs_path);
if (stat(path, &st) < 0) {
error_setg_errno(errp, errno, "no such host device");
goto out_free_path;
}
dir = opendir(path);
if (!dir) {
error_setg_errno(errp, errno, "couldn't open directory %s", path);
goto out_free_path;
}
while ((dent = readdir(dir))) {
if (!strncmp(dent->d_name, "vfio", 4)) {
vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name);
break;
}
}
if (!vfio_dev_path) {
error_setg(errp, "failed to find vfio-dev/vfioX/dev");
goto out_close_dir;
}
if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) {
error_setg(errp, "failed to load \"%s\"", vfio_dev_path);
goto out_free_dev_path;
}
if (sscanf(contents, "%d:%d", &major, &minor) != 2) {
error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path);
goto out_free_dev_path;
}
g_free(contents);
vfio_devt = makedev(major, minor);
vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name);
ret = open_cdev(vfio_path, vfio_devt);
if (ret < 0) {
error_setg(errp, "Failed to open %s", vfio_path);
}
trace_iommufd_cdev_getfd(vfio_path, ret);
g_free(vfio_path);
out_free_dev_path:
g_free(vfio_dev_path);
out_close_dir:
closedir(dir);
out_free_path:
if (*errp) {
error_prepend(errp, VFIO_MSG_PREFIX, path);
}
g_free(path);
return ret;
}
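
For orientation, the layout iommufd_cdev_getfd() resolves looks like this (the major:minor values are illustrative):

/*
 *   <sysfsdev>/vfio-dev/vfio0/dev    contains "511:0" (major:minor)
 *   /dev/vfio/devices/vfio0          character device 511:0
 *
 * open_cdev() re-checks that the node it opens really carries that
 * major:minor, guarding against stale or renamed /dev entries.
 */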
static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
Error **errp)
{
int ret, iommufd = vbasedev->iommufd->fd;
struct vfio_device_attach_iommufd_pt attach_data = {
.argsz = sizeof(attach_data),
.flags = 0,
.pt_id = id,
};
/* Attach device to an IOAS or hwpt within iommufd */
ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data);
if (ret) {
error_setg_errno(errp, errno,
"[iommufd=%d] error attach %s (%d) to id=%d",
iommufd, vbasedev->name, vbasedev->fd, id);
} else {
trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name,
vbasedev->fd, id);
}
return ret;
}
static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
{
int ret, iommufd = vbasedev->iommufd->fd;
struct vfio_device_detach_iommufd_pt detach_data = {
.argsz = sizeof(detach_data),
.flags = 0,
};
ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data);
if (ret) {
error_setg_errno(errp, errno, "detach %s failed", vbasedev->name);
} else {
trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name);
}
return ret;
}
static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
VFIOIOMMUFDContainer *container,
Error **errp)
{
return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
}
static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
VFIOIOMMUFDContainer *container)
{
Error *err = NULL;
if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
error_report_err(err);
}
}
static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container)
{
VFIOContainerBase *bcontainer = &container->bcontainer;
if (!QLIST_EMPTY(&bcontainer->device_list)) {
return;
}
memory_listener_unregister(&bcontainer->listener);
vfio_container_destroy(bcontainer);
iommufd_backend_free_id(container->be, container->ioas_id);
g_free(container);
}
static int iommufd_cdev_ram_block_discard_disable(bool state)
{
/*
* We support coordinated discarding of RAM via the RamDiscardManager.
*/
return ram_block_uncoordinated_discard_disable(state);
}
static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container,
uint32_t ioas_id, Error **errp)
{
VFIOContainerBase *bcontainer = &container->bcontainer;
struct iommu_ioas_iova_ranges *info;
struct iommu_iova_range *iova_ranges;
int ret, sz, fd = container->be->fd;
info = g_malloc0(sizeof(*info));
info->size = sizeof(*info);
info->ioas_id = ioas_id;
ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);
if (ret && errno != EMSGSIZE) {
goto error;
}
sz = info->num_iovas * sizeof(struct iommu_iova_range);
info = g_realloc(info, sizeof(*info) + sz);
info->allowed_iovas = (uintptr_t)(info + 1);
ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);
if (ret) {
goto error;
}
iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas;
for (int i = 0; i < info->num_iovas; i++) {
Range *range = g_new(Range, 1);
range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last);
bcontainer->iova_ranges =
range_list_insert(bcontainer->iova_ranges, range);
}
bcontainer->pgsizes = info->out_iova_alignment;
g_free(info);
return 0;
error:
ret = -errno;
g_free(info);
error_setg_errno(errp, errno, "Cannot get IOVA ranges");
return ret;
}
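
The function follows the usual two-pass sizing idiom for variable-length ioctls. In outline:

info->size = sizeof(*info);                  /* pass 1: no room for ranges */
ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);     /* fails with EMSGSIZE, sets num_iovas */
info = g_realloc(info, sizeof(*info) + sz);  /* sz = num_iovas * sizeof(range) */
info->allowed_iovas = (uintptr_t)(info + 1); /* point at the new tail */
ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);     /* pass 2: ranges filled in */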
static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
VFIOContainerBase *bcontainer;
VFIOIOMMUFDContainer *container;
VFIOAddressSpace *space;
struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
int ret, devfd;
uint32_t ioas_id;
Error *err = NULL;
if (vbasedev->fd < 0) {
devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp);
if (devfd < 0) {
return devfd;
}
vbasedev->fd = devfd;
} else {
devfd = vbasedev->fd;
}
ret = iommufd_cdev_connect_and_bind(vbasedev, errp);
if (ret) {
goto err_connect_bind;
}
space = vfio_get_address_space(as);
/* try to attach to an existing container in this space */
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
if (bcontainer->ops != &vfio_iommufd_ops ||
vbasedev->iommufd != container->be) {
continue;
}
if (iommufd_cdev_attach_container(vbasedev, container, &err)) {
const char *msg = error_get_pretty(err);
trace_iommufd_cdev_fail_attach_existing_container(msg);
error_free(err);
err = NULL;
} else {
ret = iommufd_cdev_ram_block_discard_disable(true);
if (ret) {
error_setg(errp,
"Cannot set discarding of RAM broken (%d)", ret);
goto err_discard_disable;
}
goto found_container;
}
}
/* Need to allocate a new dedicated container */
ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp);
if (ret < 0) {
goto err_alloc_ioas;
}
trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id);
container = g_malloc0(sizeof(*container));
container->be = vbasedev->iommufd;
container->ioas_id = ioas_id;
bcontainer = &container->bcontainer;
vfio_container_init(bcontainer, space, &vfio_iommufd_ops);
QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
ret = iommufd_cdev_attach_container(vbasedev, container, errp);
if (ret) {
goto err_attach_container;
}
ret = iommufd_cdev_ram_block_discard_disable(true);
if (ret) {
goto err_discard_disable;
}
ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err);
if (ret) {
error_append_hint(&err,
"Fallback to default 64bit IOVA range and 4K page size\n");
warn_report_err(err);
err = NULL;
bcontainer->pgsizes = qemu_real_host_page_size();
}
bcontainer->listener = vfio_memory_listener;
memory_listener_register(&bcontainer->listener, bcontainer->space->as);
if (bcontainer->error) {
ret = -1;
error_propagate_prepend(errp, bcontainer->error,
"memory listener initialization failed: ");
goto err_listener_register;
}
bcontainer->initialized = true;
found_container:
ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info);
if (ret) {
error_setg_errno(errp, errno, "error getting device info");
goto err_listener_register;
}
/*
* TODO: examine RAM_BLOCK_DISCARD handling; should the discard
* incompatibility check also be done at group level?
*/
if (vbasedev->ram_block_discard_allowed) {
iommufd_cdev_ram_block_discard_disable(false);
}
vbasedev->group = 0;
vbasedev->num_irqs = dev_info.num_irqs;
vbasedev->num_regions = dev_info.num_regions;
vbasedev->flags = dev_info.flags;
vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
vbasedev->bcontainer = bcontainer;
QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
vbasedev->num_regions, vbasedev->flags);
return 0;
err_listener_register:
iommufd_cdev_ram_block_discard_disable(false);
err_discard_disable:
iommufd_cdev_detach_container(vbasedev, container);
err_attach_container:
iommufd_cdev_container_destroy(container);
err_alloc_ioas:
vfio_put_address_space(space);
iommufd_cdev_unbind_and_disconnect(vbasedev);
err_connect_bind:
close(vbasedev->fd);
return ret;
}
static void iommufd_cdev_detach(VFIODevice *vbasedev)
{
VFIOContainerBase *bcontainer = vbasedev->bcontainer;
VFIOAddressSpace *space = bcontainer->space;
VFIOIOMMUFDContainer *container = container_of(bcontainer,
VFIOIOMMUFDContainer,
bcontainer);
QLIST_REMOVE(vbasedev, global_next);
QLIST_REMOVE(vbasedev, container_next);
vbasedev->bcontainer = NULL;
if (!vbasedev->ram_block_discard_allowed) {
iommufd_cdev_ram_block_discard_disable(false);
}
iommufd_cdev_detach_container(vbasedev, container);
iommufd_cdev_container_destroy(container);
vfio_put_address_space(space);
iommufd_cdev_unbind_and_disconnect(vbasedev);
close(vbasedev->fd);
}
static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid)
{
VFIODevice *vbasedev_iter;
QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) {
if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) {
continue;
}
if (devid == vbasedev_iter->devid) {
return vbasedev_iter;
}
}
return NULL;
}
static VFIOPCIDevice *
iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev,
VFIODevice *reset_dev)
{
VFIODevice *vbasedev_tmp;
if (dep_dev->devid == reset_dev->devid ||
dep_dev->devid == VFIO_PCI_DEVID_OWNED) {
return NULL;
}
vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid);
if (!vbasedev_tmp || !vbasedev_tmp->dev->realized ||
vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) {
return NULL;
}
return container_of(vbasedev_tmp, VFIOPCIDevice, vbasedev);
}
static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single)
{
VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
struct vfio_pci_hot_reset_info *info = NULL;
struct vfio_pci_dependent_device *devices;
struct vfio_pci_hot_reset *reset;
int ret, i;
bool multi = false;
trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
if (!single) {
vfio_pci_pre_reset(vdev);
}
vdev->vbasedev.needs_reset = false;
ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
if (ret) {
goto out_single;
}
assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID);
devices = &info->devices[0];
if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) {
if (!vdev->has_pm_reset) {
for (i = 0; i < info->count; i++) {
if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) {
error_report("vfio: Cannot reset device %s, "
"depends on device %04x:%02x:%02x.%x "
"which is not owned.",
vdev->vbasedev.name, devices[i].segment,
devices[i].bus, PCI_SLOT(devices[i].devfn),
PCI_FUNC(devices[i].devfn));
}
}
}
ret = -EPERM;
goto out_single;
}
trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
for (i = 0; i < info->count; i++) {
VFIOPCIDevice *tmp;
trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment,
devices[i].bus,
PCI_SLOT(devices[i].devfn),
PCI_FUNC(devices[i].devfn),
devices[i].devid);
/*
* If a VFIO cdev device is resettable, all the dependent devices
* are either bound to the same iommufd or within the same iommu_group
* as one of the iommufd-bound devices.
*/
assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED);
tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev);
if (!tmp) {
continue;
}
if (single) {
ret = -EINVAL;
goto out_single;
}
vfio_pci_pre_reset(tmp);
tmp->vbasedev.needs_reset = false;
multi = true;
}
if (!single && !multi) {
ret = -EINVAL;
goto out_single;
}
/* Use zero length array for hot reset with iommufd backend */
reset = g_malloc0(sizeof(*reset));
reset->argsz = sizeof(*reset);
/* Bus reset! */
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
g_free(reset);
if (ret) {
ret = -errno;
}
trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
ret ? strerror(errno) : "Success");
/* Re-enable INTx on affected devices */
for (i = 0; i < info->count; i++) {
VFIOPCIDevice *tmp;
tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev);
if (!tmp) {
continue;
}
vfio_pci_post_reset(tmp);
}
out_single:
if (!single) {
vfio_pci_post_reset(vdev);
}
g_free(info);
return ret;
}
const VFIOIOMMUOps vfio_iommufd_ops = {
.dma_map = iommufd_cdev_map,
.dma_unmap = iommufd_cdev_unmap,
.attach_device = iommufd_cdev_attach,
.detach_device = iommufd_cdev_detach,
.pci_hot_reset = iommufd_cdev_pci_hot_reset,
};
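
Note that .set_dirty_page_tracking and .query_dirty_bitmap are deliberately absent here (see the TODO in iommufd_cdev_unmap() above): vfio_container_init() starts every container with dirty_pages_supported == false, so the generic wrapper returns before its assertion on the missing callback can fire, and the query path is only reached once tracking has been enabled:

/* from vfio_container_set_dirty_page_tracking() in container-base.c */
if (!bcontainer->dirty_pages_supported) {
    return 0;
}
g_assert(bcontainer->ops->set_dirty_page_tracking);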

hw/vfio/meson.build

@ -2,10 +2,14 @@ vfio_ss = ss.source_set()
vfio_ss.add(files(
'helpers.c',
'common.c',
'container-base.c',
'container.c',
'spapr.c',
'migration.c',
))
vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files(
'iommufd.c',
))
vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
'display.c',
'pci-quirks.c',

hw/vfio/pci.c

@ -19,6 +19,7 @@
*/
#include "qemu/osdep.h"
#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <sys/ioctl.h>
@ -42,6 +43,7 @@
#include "qapi/error.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "sysemu/iommufd.h"
#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
@ -2374,7 +2376,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
return 0;
}
static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
{
PCIDevice *pdev = &vdev->pdev;
uint16_t cmd;
@ -2411,7 +2413,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}
static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
Error *err = NULL;
int nr;
@ -2435,7 +2437,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
vfio_quirk_reset(vdev);
}
static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
{
char tmp[13];
@ -2445,22 +2447,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
return (strcmp(tmp, name) == 0);
}
static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
struct vfio_pci_hot_reset_info **info_p)
{
VFIOGroup *group;
struct vfio_pci_hot_reset_info *info;
struct vfio_pci_dependent_device *devices;
struct vfio_pci_hot_reset *reset;
int32_t *fds;
int ret, i, count;
bool multi = false;
int ret, count;
trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
if (!single) {
vfio_pci_pre_reset(vdev);
}
vdev->vbasedev.needs_reset = false;
assert(info_p && !*info_p);
info = g_malloc0(sizeof(*info));
info->argsz = sizeof(*info);
@ -2468,163 +2461,36 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
if (ret && errno != ENOSPC) {
ret = -errno;
g_free(info);
if (!vdev->has_pm_reset) {
error_report("vfio: Cannot reset device %s, "
"no available reset mechanism.", vdev->vbasedev.name);
}
goto out_single;
return ret;
}
count = info->count;
info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
info->argsz = sizeof(*info) + (count * sizeof(*devices));
devices = &info->devices[0];
info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
if (ret) {
ret = -errno;
g_free(info);
error_report("vfio: hot reset info failed: %m");
goto out_single;
return ret;
}
trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
*info_p = info;
return 0;
}
/* Verify that we have all the groups required */
for (i = 0; i < info->count; i++) {
PCIHostDeviceAddress host;
VFIOPCIDevice *tmp;
VFIODevice *vbasedev_iter;
static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
{
VFIODevice *vbasedev = &vdev->vbasedev;
const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops;
host.domain = devices[i].segment;
host.bus = devices[i].bus;
host.slot = PCI_SLOT(devices[i].devfn);
host.function = PCI_FUNC(devices[i].devfn);
trace_vfio_pci_hot_reset_dep_devices(host.domain,
host.bus, host.slot, host.function, devices[i].group_id);
if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
continue;
}
QLIST_FOREACH(group, &vfio_group_list, next) {
if (group->groupid == devices[i].group_id) {
break;
}
}
if (!group) {
if (!vdev->has_pm_reset) {
error_report("vfio: Cannot reset device %s, "
"depends on group %d which is not owned.",
vdev->vbasedev.name, devices[i].group_id);
}
ret = -EPERM;
goto out;
}
/* Prep dependent devices for reset and clear our marker. */
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (!vbasedev_iter->dev->realized ||
vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
continue;
}
tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
if (single) {
ret = -EINVAL;
goto out_single;
}
vfio_pci_pre_reset(tmp);
tmp->vbasedev.needs_reset = false;
multi = true;
break;
}
}
}
if (!single && !multi) {
ret = -EINVAL;
goto out_single;
}
/* Determine how many group fds need to be passed */
count = 0;
QLIST_FOREACH(group, &vfio_group_list, next) {
for (i = 0; i < info->count; i++) {
if (group->groupid == devices[i].group_id) {
count++;
break;
}
}
}
reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
fds = &reset->group_fds[0];
/* Fill in group fds */
QLIST_FOREACH(group, &vfio_group_list, next) {
for (i = 0; i < info->count; i++) {
if (group->groupid == devices[i].group_id) {
fds[reset->count++] = group->fd;
break;
}
}
}
/* Bus reset! */
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
g_free(reset);
trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
ret ? strerror(errno) : "Success");
out:
/* Re-enable INTx on affected devices */
for (i = 0; i < info->count; i++) {
PCIHostDeviceAddress host;
VFIOPCIDevice *tmp;
VFIODevice *vbasedev_iter;
host.domain = devices[i].segment;
host.bus = devices[i].bus;
host.slot = PCI_SLOT(devices[i].devfn);
host.function = PCI_FUNC(devices[i].devfn);
if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
continue;
}
QLIST_FOREACH(group, &vfio_group_list, next) {
if (group->groupid == devices[i].group_id) {
break;
}
}
if (!group) {
break;
}
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (!vbasedev_iter->dev->realized ||
vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
continue;
}
tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
vfio_pci_post_reset(tmp);
break;
}
}
}
out_single:
if (!single) {
vfio_pci_post_reset(vdev);
}
g_free(info);
return ret;
return ops->pci_hot_reset(vbasedev, single);
}
/*
@ -3078,17 +2944,19 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
VFIODevice *vbasedev = &vdev->vbasedev;
char *tmp, *subsys;
Error *err = NULL;
struct stat st;
int i, ret;
bool is_mdev;
char uuid[UUID_STR_LEN];
char *name;
if (!vbasedev->sysfsdev) {
if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
if (!(~vdev->host.domain || ~vdev->host.bus ||
~vdev->host.slot || ~vdev->host.function)) {
error_setg(errp, "No provided host device");
error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
#ifdef CONFIG_IOMMUFD
"or -device vfio-pci,fd=DEVICE_FD "
#endif
"or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
return;
}
@ -3098,17 +2966,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vdev->host.slot, vdev->host.function);
}
if (stat(vbasedev->sysfsdev, &st) < 0) {
error_setg_errno(errp, errno, "no such host device");
error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
if (vfio_device_get_name(vbasedev, errp) < 0) {
return;
}
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
vbasedev->ops = &vfio_pci_ops;
vbasedev->type = VFIO_DEVICE_TYPE_PCI;
vbasedev->dev = DEVICE(vdev);
/*
* Mediated devices *might* operate compatibly with discarding of RAM, but
* we cannot know for certain, it depends on whether the mdev vendor driver
@ -3456,6 +3317,7 @@ static void vfio_instance_init(Object *obj)
{
PCIDevice *pci_dev = PCI_DEVICE(obj);
VFIOPCIDevice *vdev = VFIO_PCI(obj);
VFIODevice *vbasedev = &vdev->vbasedev;
device_add_bootindex_property(obj, &vdev->bootindex,
"bootindex", NULL,
@ -3465,6 +3327,9 @@ static void vfio_instance_init(Object *obj)
vdev->host.slot = ~0U;
vdev->host.function = ~0U;
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
DEVICE(vdev), false);
vdev->nv_gpudirect_clique = 0xFF;
/* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
@ -3517,14 +3382,20 @@ static Property vfio_pci_dev_properties[] = {
qdev_prop_nv_gpudirect_clique, uint8_t),
DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
OFF_AUTOPCIBAR_OFF),
/*
* TODO - support passed fds... is this necessary?
* DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
* DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
*/
#ifdef CONFIG_IOMMUFD
DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
DEFINE_PROP_END_OF_LIST(),
};
#ifdef CONFIG_IOMMUFD
static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
{
vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp);
}
#endif
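
The new "fd" property pairs with the "iommufd" link property registered above. A plausible invocation, with illustrative values (the fd is expected to come from a management layer that has already opened the cdev):

-object iommufd,id=iommufd0
-device vfio-pci,iommufd=iommufd0,fd=23

The string is parsed by monitor_fd_param() via vfio_device_set_fd(), so a named fd previously handed to the monitor also works.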
static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@ -3532,6 +3403,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
dc->reset = vfio_pci_reset;
device_class_set_props(dc, vfio_pci_dev_properties);
#ifdef CONFIG_IOMMUFD
object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
#endif
dc->desc = "VFIO-based PCI device assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
pdc->realize = vfio_realize;

hw/vfio/pci.h

@ -218,6 +218,12 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr);
extern const PropertyInfo qdev_prop_nv_gpudirect_clique;
void vfio_pci_pre_reset(VFIOPCIDevice *vdev);
void vfio_pci_post_reset(VFIOPCIDevice *vdev);
bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name);
int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
struct vfio_pci_hot_reset_info **info_p);
int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,

hw/vfio/platform.c

@ -15,11 +15,13 @@
*/
#include "qemu/osdep.h"
#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include "qapi/error.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "hw/vfio/vfio-platform.h"
#include "sysemu/iommufd.h"
#include "migration/vmstate.h"
#include "qemu/error-report.h"
#include "qemu/lockable.h"
@ -529,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = {
*/
static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
{
struct stat st;
int ret;
/* @sysfsdev takes precedence over @host */
if (vbasedev->sysfsdev) {
/* @fd takes precedence over @sysfsdev which takes precedence over @host */
if (vbasedev->fd < 0 && vbasedev->sysfsdev) {
g_free(vbasedev->name);
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
} else {
} else if (vbasedev->fd < 0) {
if (!vbasedev->name || strchr(vbasedev->name, '/')) {
error_setg(errp, "wrong host device name");
return -EINVAL;
@ -546,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
vbasedev->name);
}
if (stat(vbasedev->sysfsdev, &st) < 0) {
error_setg_errno(errp, errno,
"failed to get the sysfs host device file status");
return -errno;
ret = vfio_device_get_name(vbasedev, errp);
if (ret) {
return ret;
}
ret = vfio_attach_device(vbasedev->name, vbasedev,
@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp)
VFIODevice *vbasedev = &vdev->vbasedev;
int i, ret;
vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
vbasedev->dev = dev;
vbasedev->ops = &vfio_platform_ops;
qemu_mutex_init(&vdev->intp_mutex);
trace_vfio_platform_realize(vbasedev->sysfsdev ?
@ -649,9 +645,29 @@ static Property vfio_platform_dev_properties[] = {
DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
mmap_timeout, 1100),
DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true),
#ifdef CONFIG_IOMMUFD
DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd,
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
#endif
DEFINE_PROP_END_OF_LIST(),
};
static void vfio_platform_instance_init(Object *obj)
{
VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj);
VFIODevice *vbasedev = &vdev->vbasedev;
vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops,
DEVICE(vdev), false);
}
#ifdef CONFIG_IOMMUFD
static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp)
{
vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp);
}
#endif
static void vfio_platform_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@ -659,6 +675,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data)
dc->realize = vfio_platform_realize;
device_class_set_props(dc, vfio_platform_dev_properties);
#ifdef CONFIG_IOMMUFD
object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd);
#endif
dc->vmsd = &vfio_platform_vmstate;
dc->desc = "VFIO-based platform device assignment";
sbc->connect_irq_notifier = vfio_start_irqfd_injection;
@ -671,6 +690,7 @@ static const TypeInfo vfio_platform_dev_info = {
.name = TYPE_VFIO_PLATFORM,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(VFIOPlatformDevice),
.instance_init = vfio_platform_instance_init,
.class_init = vfio_platform_class_init,
.class_size = sizeof(VFIOPlatformDeviceClass),
};

hw/vfio/spapr.c

@ -24,6 +24,12 @@
#include "qapi/error.h"
#include "trace.h"
typedef struct VFIOSpaprContainer {
VFIOContainer container;
MemoryListener prereg_listener;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
} VFIOSpaprContainer;
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
if (memory_region_is_iommu(section->mr)) {
@ -44,8 +50,10 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
static void vfio_prereg_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer,
prereg_listener);
VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
prereg_listener);
VFIOContainer *container = &scontainer->container;
VFIOContainerBase *bcontainer = &container->bcontainer;
const hwaddr gpa = section->offset_within_address_space;
hwaddr end;
int ret;
@ -88,9 +96,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener,
* can gracefully fail. Runtime, there's not much we can do other
* than throw a hardware error.
*/
if (!container->initialized) {
if (!container->error) {
error_setg_errno(&container->error, -ret,
if (!bcontainer->initialized) {
if (!bcontainer->error) {
error_setg_errno(&bcontainer->error, -ret,
"Memory registering failed");
}
} else {
@ -102,8 +110,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener,
static void vfio_prereg_listener_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer,
prereg_listener);
VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
prereg_listener);
VFIOContainer *container = &scontainer->container;
const hwaddr gpa = section->offset_within_address_space;
hwaddr end;
int ret;
@ -146,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = {
.region_del = vfio_prereg_listener_region_del,
};
static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
hwaddr max_iova, uint64_t iova_pgsizes)
{
VFIOHostDMAWindow *hostwin;
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
if (ranges_overlap(hostwin->min_iova,
hostwin->max_iova - hostwin->min_iova + 1,
min_iova,
@ -165,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
hostwin->min_iova = min_iova;
hostwin->max_iova = max_iova;
hostwin->iova_pgsizes = iova_pgsizes;
QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
}
static int vfio_host_win_del(VFIOContainer *container,
static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
hwaddr min_iova, hwaddr max_iova)
{
VFIOHostDMAWindow *hostwin;
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
QLIST_REMOVE(hostwin, hostwin_next);
g_free(hostwin);
@ -184,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container,
return -1;
}
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
hwaddr iova, hwaddr end)
{
VFIOHostDMAWindow *hostwin;
@ -226,6 +235,7 @@ static int vfio_spapr_create_window(VFIOContainer *container,
hwaddr *pgsize)
{
int ret = 0;
VFIOContainerBase *bcontainer = &container->bcontainer;
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
unsigned entries, bits_total, bits_per_level, max_levels;
@ -239,13 +249,13 @@ static int vfio_spapr_create_window(VFIOContainer *container,
if (pagesize > rampagesize) {
pagesize = rampagesize;
}
pgmask = container->pgsizes & (pagesize | (pagesize - 1));
pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
if (!pagesize) {
error_report("Host doesn't support page size 0x%"PRIx64
", the supported mask is 0x%lx",
memory_region_iommu_get_min_page_size(iommu_mr),
container->pgsizes);
bcontainer->pgsizes);
return -EINVAL;
}
@ -313,10 +323,15 @@ static int vfio_spapr_create_window(VFIOContainer *container,
return 0;
}
int vfio_container_add_section_window(VFIOContainer *container,
MemoryRegionSection *section,
Error **errp)
static int
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
Error **errp)
{
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
container);
VFIOHostDMAWindow *hostwin;
hwaddr pgsize = 0;
int ret;
@ -332,7 +347,7 @@ int vfio_container_add_section_window(VFIOContainer *container,
iova = section->offset_within_address_space;
end = iova + int128_get64(section->size) - 1;
if (!vfio_find_hostwin(container, iova, end)) {
if (!vfio_find_hostwin(scontainer, iova, end)) {
error_setg(errp, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
iova, end);
@ -346,7 +361,7 @@ int vfio_container_add_section_window(VFIOContainer *container,
}
/* For now intersections are not allowed, we may relax this later */
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
if (ranges_overlap(hostwin->min_iova,
hostwin->max_iova - hostwin->min_iova + 1,
section->offset_within_address_space,
@ -368,7 +383,7 @@ int vfio_container_add_section_window(VFIOContainer *container,
return ret;
}
vfio_host_win_add(container, section->offset_within_address_space,
vfio_host_win_add(scontainer, section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
@ -401,16 +416,22 @@ int vfio_container_add_section_window(VFIOContainer *container,
return 0;
}
void vfio_container_del_section_window(VFIOContainer *container,
MemoryRegionSection *section)
static void
vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
container);
if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
return;
}
vfio_spapr_remove_window(container,
section->offset_within_address_space);
if (vfio_host_win_del(container,
if (vfio_host_win_del(scontainer,
section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1) < 0) {
@ -419,13 +440,26 @@ void vfio_container_del_section_window(VFIOContainer *container,
}
}
static VFIOIOMMUOps vfio_iommu_spapr_ops;
static void setup_spapr_ops(VFIOContainerBase *bcontainer)
{
vfio_iommu_spapr_ops = *bcontainer->ops;
vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window;
vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window;
bcontainer->ops = &vfio_iommu_spapr_ops;
}
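
The clone-and-override keeps every legacy callback intact and reroutes only the window hooks, so sPAPR containers still use vfio_legacy_dma_map()/unmap() while the generic wrapper transparently reaches the sPAPR code. Call path after setup_spapr_ops(), for illustration:

vfio_container_add_section_window(bcontainer, section, errp)
    -> bcontainer->ops->add_window(...)
       /* == vfio_spapr_container_add_section_window() */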
int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
{
VFIOContainerBase *bcontainer = &container->bcontainer;
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
container);
struct vfio_iommu_spapr_tce_info info;
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
int ret, fd = container->fd;
QLIST_INIT(&container->hostwin_list);
QLIST_INIT(&scontainer->hostwin_list);
/*
* The host kernel code implementing VFIO_IOMMU_DISABLE is called
@ -439,13 +473,13 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
return -errno;
}
} else {
container->prereg_listener = vfio_prereg_listener;
scontainer->prereg_listener = vfio_prereg_listener;
memory_listener_register(&container->prereg_listener,
memory_listener_register(&scontainer->prereg_listener,
&address_space_memory);
if (container->error) {
if (bcontainer->error) {
ret = -1;
error_propagate_prepend(errp, container->error,
error_propagate_prepend(errp, bcontainer->error,
"RAM memory listener initialization failed: ");
goto listener_unregister_exit;
}
@ -461,7 +495,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
}
if (v2) {
container->pgsizes = info.ddw.pgsizes;
bcontainer->pgsizes = info.ddw.pgsizes;
/*
* There is a default window in just created container.
* To make region_add/del simpler, we better remove this
@ -476,30 +510,34 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
}
} else {
/* The default table uses 4K pages */
container->pgsizes = 0x1000;
vfio_host_win_add(container, info.dma32_window_start,
bcontainer->pgsizes = 0x1000;
vfio_host_win_add(scontainer, info.dma32_window_start,
info.dma32_window_start +
info.dma32_window_size - 1,
0x1000);
}
setup_spapr_ops(bcontainer);
return 0;
listener_unregister_exit:
if (v2) {
memory_listener_unregister(&container->prereg_listener);
memory_listener_unregister(&scontainer->prereg_listener);
}
return ret;
}
void vfio_spapr_container_deinit(VFIOContainer *container)
{
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
container);
VFIOHostDMAWindow *hostwin, *next;
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
memory_listener_unregister(&container->prereg_listener);
memory_listener_unregister(&scontainer->prereg_listener);
}
QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
next) {
QLIST_REMOVE(hostwin, hostwin_next);
g_free(hostwin);

hw/vfio/trace-events

@ -116,8 +116,8 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re
vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
vfio_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
vfio_legacy_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
# platform.c
@ -164,3 +164,14 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop
vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
#iommufd.c
iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d"
iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)"
iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d"
iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s"
iommufd_cdev_fail_attach_existing_container(const char *msg) " %s"
iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d"
iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d"

include/hw/vfio/vfio-common.h

@@ -30,6 +30,7 @@
#include <linux/vfio.h>
#endif
#include "sysemu/sysemu.h"
#include "hw/vfio/vfio-container-base.h"
#define VFIO_MSG_PREFIX "vfio %s: "
@@ -72,54 +73,15 @@ typedef struct VFIOMigration {
bool initial_data_sent;
} VFIOMigration;
typedef struct VFIOAddressSpace {
AddressSpace *as;
QLIST_HEAD(, VFIOContainer) containers;
QLIST_ENTRY(VFIOAddressSpace) list;
} VFIOAddressSpace;
struct VFIOGroup;
typedef struct VFIOContainer {
VFIOAddressSpace *space;
VFIOContainerBase bcontainer;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
MemoryListener listener;
MemoryListener prereg_listener;
unsigned iommu_type;
Error *error;
bool initialized;
bool dirty_pages_supported;
uint64_t dirty_pgsizes;
uint64_t max_dirty_bitmap_size;
unsigned long pgsizes;
unsigned int dma_max_mappings;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
QLIST_ENTRY(VFIOContainer) next;
QLIST_HEAD(, VFIODevice) device_list;
GList *iova_ranges;
} VFIOContainer;
typedef struct VFIOGuestIOMMU {
VFIOContainer *container;
IOMMUMemoryRegion *iommu_mr;
hwaddr iommu_offset;
IOMMUNotifier n;
QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;
typedef struct VFIORamDiscardListener {
VFIOContainer *container;
MemoryRegion *mr;
hwaddr offset_within_address_space;
hwaddr size;
uint64_t granularity;
RamDiscardListener listener;
QLIST_ENTRY(VFIORamDiscardListener) next;
} VFIORamDiscardListener;
typedef struct VFIOHostDMAWindow {
hwaddr min_iova;
hwaddr max_iova;
@@ -127,6 +89,14 @@ typedef struct VFIOHostDMAWindow {
QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
} VFIOHostDMAWindow;
typedef struct IOMMUFDBackend IOMMUFDBackend;
typedef struct VFIOIOMMUFDContainer {
VFIOContainerBase bcontainer;
IOMMUFDBackend *be;
uint32_t ioas_id;
} VFIOIOMMUFDContainer;
typedef struct VFIODeviceOps VFIODeviceOps;
typedef struct VFIODevice {
@@ -134,7 +104,7 @@ typedef struct VFIODevice {
QLIST_ENTRY(VFIODevice) container_next;
QLIST_ENTRY(VFIODevice) global_next;
struct VFIOGroup *group;
VFIOContainer *container;
VFIOContainerBase *bcontainer;
char *sysfsdev;
char *name;
DeviceState *dev;
@@ -154,6 +124,8 @@ typedef struct VFIODevice {
OnOffAuto pre_copy_dirty_page_tracking;
bool dirty_pages_supported;
bool dirty_tracking;
int devid;
IOMMUFDBackend *iommufd;
} VFIODevice;
struct VFIODeviceOps {
@@ -201,31 +173,10 @@ typedef struct VFIODisplay {
} dmabuf;
} VFIODisplay;
typedef struct {
unsigned long *bitmap;
hwaddr size;
hwaddr pages;
} VFIOBitmap;
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
bool vfio_devices_all_running_and_saving(VFIOContainer *container);
/* container->fd */
int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
ram_addr_t size, IOMMUTLBEntry *iotlb);
int vfio_dma_map(VFIOContainer *container, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly);
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start);
int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
/* SPAPR specific */
int vfio_container_add_section_window(VFIOContainer *container,
MemoryRegionSection *section,
Error **errp);
void vfio_container_del_section_window(VFIOContainer *container,
MemoryRegionSection *section);
int vfio_spapr_container_init(VFIOContainer *container, Error **errp);
void vfio_spapr_container_deinit(VFIOContainer *container);
@@ -259,7 +210,8 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIOGroupList vfio_group_list;
extern VFIODeviceList vfio_device_list;
extern const VFIOIOMMUOps vfio_legacy_ops;
extern const VFIOIOMMUOps vfio_iommufd_ops;
extern const MemoryListener vfio_memory_listener;
extern int vfio_kvm_device_fd;
@@ -292,11 +244,19 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
void vfio_migration_exit(VFIODevice *vbasedev);
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size);
bool vfio_devices_all_running_and_mig_active(VFIOContainer *container);
bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container);
int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
bool
vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer);
bool
vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer);
int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size);
int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
uint64_t size, ram_addr_t ram_addr);
int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
uint64_t size, ram_addr_t ram_addr);
/* Returns 0 on success, or a negative errno. */
int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
DeviceState *dev, bool ram_discard);
#endif /* HW_VFIO_VFIO_COMMON_H */

include/hw/vfio/vfio-container-base.h Normal file

@@ -0,0 +1,121 @@
/*
* VFIO BASE CONTAINER
*
* Copyright (C) 2023 Intel Corporation.
* Copyright Red Hat, Inc. 2023
*
* Authors: Yi Liu <yi.l.liu@intel.com>
* Eric Auger <eric.auger@redhat.com>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H
#define HW_VFIO_VFIO_CONTAINER_BASE_H
#include "exec/memory.h"
typedef struct VFIODevice VFIODevice;
typedef struct VFIOIOMMUOps VFIOIOMMUOps;
typedef struct {
unsigned long *bitmap;
hwaddr size;
hwaddr pages;
} VFIOBitmap;
typedef struct VFIOAddressSpace {
AddressSpace *as;
QLIST_HEAD(, VFIOContainerBase) containers;
QLIST_ENTRY(VFIOAddressSpace) list;
} VFIOAddressSpace;
/*
* This is the base object for vfio container backends
*/
typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
VFIOAddressSpace *space;
MemoryListener listener;
Error *error;
bool initialized;
uint64_t dirty_pgsizes;
uint64_t max_dirty_bitmap_size;
unsigned long pgsizes;
unsigned int dma_max_mappings;
bool dirty_pages_supported;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
QLIST_ENTRY(VFIOContainerBase) next;
QLIST_HEAD(, VFIODevice) device_list;
GList *iova_ranges;
} VFIOContainerBase;
typedef struct VFIOGuestIOMMU {
VFIOContainerBase *bcontainer;
IOMMUMemoryRegion *iommu_mr;
hwaddr iommu_offset;
IOMMUNotifier n;
QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;
typedef struct VFIORamDiscardListener {
VFIOContainerBase *bcontainer;
MemoryRegion *mr;
hwaddr offset_within_address_space;
hwaddr size;
uint64_t granularity;
RamDiscardListener listener;
QLIST_ENTRY(VFIORamDiscardListener) next;
} VFIORamDiscardListener;
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
Error **errp);
void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
bool start);
int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
void vfio_container_init(VFIOContainerBase *bcontainer,
VFIOAddressSpace *space,
const VFIOIOMMUOps *ops);
void vfio_container_destroy(VFIOContainerBase *bcontainer);
struct VFIOIOMMUOps {
/* basic feature */
int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
int (*dma_unmap)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
int (*attach_device)(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
void (*detach_device)(VFIODevice *vbasedev);
/* migration feature */
int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
bool start);
int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
/* PCI specific */
int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
/* SPAPR specific */
int (*add_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
Error **errp);
void (*del_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
};
#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */
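A minimal sketch of how a backend would specialize this base container; the
"mock" names below are invented for illustration and are not part of the
series, but the spapr and iommufd containers above and below follow the same
embed-and-container_of pattern:

    #include "qemu/osdep.h"
    #include "hw/vfio/vfio-container-base.h"

    typedef struct VFIOMockContainer {
        VFIOContainerBase bcontainer;   /* base object, embedded first */
        int fd;                         /* backend-private state */
    } VFIOMockContainer;

    static int vfio_mock_dma_map(const VFIOContainerBase *bcontainer,
                                 hwaddr iova, ram_addr_t size,
                                 void *vaddr, bool readonly)
    {
        /* Recover the derived container from the embedded base object. */
        const VFIOMockContainer *container =
            container_of(bcontainer, VFIOMockContainer, bcontainer);

        /* A real backend would program its IOMMU here. */
        return container->fd < 0 ? -ENODEV : 0;
    }

    static const VFIOIOMMUOps vfio_mock_ops = {
        .dma_map = vfio_mock_dma_map,
        /* callbacks left NULL are simply not offered by this backend */
    };

Generic code never sees the derived type: vfio_container_init(&container->bcontainer,
space, &vfio_mock_ops) installs the vtable, and vfio_container_dma_map()
dispatches through bcontainer->ops->dma_map.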

include/qemu/chardev_open.h Normal file

@@ -0,0 +1,16 @@
/*
* QEMU Chardev Helper
*
* Copyright (C) 2023 Intel Corporation.
*
* Authors: Yi Liu <yi.l.liu@intel.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*/
#ifndef QEMU_CHARDEV_OPEN_H
#define QEMU_CHARDEV_OPEN_H
int open_cdev(const char *devpath, dev_t cdev);
#endif

include/sysemu/iommufd.h Normal file

@@ -0,0 +1,38 @@
#ifndef SYSEMU_IOMMUFD_H
#define SYSEMU_IOMMUFD_H
#include "qom/object.h"
#include "qemu/thread.h"
#include "exec/hwaddr.h"
#include "exec/cpu-common.h"
#define TYPE_IOMMUFD_BACKEND "iommufd"
OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
struct IOMMUFDBackendClass {
ObjectClass parent_class;
};
struct IOMMUFDBackend {
Object parent;
/*< protected >*/
int fd; /* /dev/iommu file descriptor */
bool owned; /* is the /dev/iommu opened internally */
QemuMutex lock;
uint32_t users;
/*< public >*/
};
int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
void iommufd_backend_disconnect(IOMMUFDBackend *be);
int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
Error **errp);
void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly);
int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, ram_addr_t size);
#endif
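Read literally from the declarations above, a consumer of the backend would
connect, carve out an IOAS, and map guest memory roughly as follows. This is
a sketch only: error handling is abbreviated and the mock_* name and IOVA
values are invented.

    #include "qemu/osdep.h"
    #include "sysemu/iommufd.h"

    static int mock_iommufd_roundtrip(IOMMUFDBackend *be, void *buf,
                                      ram_addr_t size, Error **errp)
    {
        uint32_t ioas_id;
        int ret;

        ret = iommufd_backend_connect(be, errp); /* opens /dev/iommu if not pre-opened */
        if (ret) {
            return ret;
        }
        ret = iommufd_backend_alloc_ioas(be, &ioas_id, errp);
        if (!ret) {
            /* Map the buffer at IOVA 0, then tear the mapping down again. */
            ret = iommufd_backend_map_dma(be, ioas_id, 0, size, buf, false);
            if (!ret) {
                ret = iommufd_backend_unmap_dma(be, ioas_id, 0, size);
            }
            iommufd_backend_free_id(be, ioas_id);
        }
        iommufd_backend_disconnect(be);
        return ret;
    }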

qapi/qom.json

@@ -794,6 +794,23 @@
{ 'struct': 'VfioUserServerProperties',
'data': { 'socket': 'SocketAddress', 'device': 'str' } }
##
# @IOMMUFDProperties:
#
# Properties for iommufd objects.
#
# @fd: file descriptor name previously passed via 'getfd' command,
# which represents a pre-opened /dev/iommu. This allows the
# iommufd object to be shared across several subsystems
# (VFIO, VDPA, ...), and the file descriptor to be shared
# with other processes, e.g. DPDK. (default: QEMU opens
# /dev/iommu by itself)
#
# Since: 9.0
##
{ 'struct': 'IOMMUFDProperties',
'data': { '*fd': 'str' } }
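With this property, a management layer could hand QEMU a pre-opened
/dev/iommu descriptor over QMP, plausibly along these lines (a sketch built
only on the schema above and the standard getfd/object-add commands; the fd
name and object id are invented):

    { "execute": "getfd", "arguments": { "fdname": "iommufd-fd" } }
    { "execute": "object-add",
      "arguments": { "qom-type": "iommufd", "id": "iommufd0",
                     "fd": "iommufd-fd" } }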
##
# @RngProperties:
#
@@ -934,6 +951,7 @@
'input-barrier',
{ 'name': 'input-linux',
'if': 'CONFIG_LINUX' },
'iommufd',
'iothread',
'main-loop',
{ 'name': 'memory-backend-epc',
@@ -1003,6 +1021,7 @@
'input-barrier': 'InputBarrierProperties',
'input-linux': { 'type': 'InputLinuxProperties',
'if': 'CONFIG_LINUX' },
'iommufd': 'IOMMUFDProperties',
'iothread': 'IothreadProperties',
'main-loop': 'MainLoopProperties',
'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties',

qemu-options.hx

@@ -5224,6 +5224,18 @@ SRST
The ``share`` boolean option is on by default with memfd.
``-object iommufd,id=id[,fd=fd]``
Creates an iommufd backend which allows control of DMA mapping
through the ``/dev/iommu`` device.
The ``id`` parameter is a unique ID which frontends (such as
vfio-pci or vdpa) will use to connect to the iommufd backend.
The ``fd`` parameter is an optional pre-opened file descriptor
obtained by opening ``/dev/iommu``. Usually the iommufd is shared
across all subsystems, bringing the benefit of centralized
reference counting.
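A plausible invocation looks like this (the device-side ``iommufd``
property is added by other patches in this series; the PCI address is
illustrative)::

    -object iommufd,id=iommufd0 \
    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0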
``-object rng-builtin,id=id``
Creates a random number generator backend which obtains entropy
from QEMU builtin functions. The ``id`` parameter is a unique ID

util/chardev_open.c Normal file

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2019, Mellanox Technologies. All rights reserved.
* Copyright (C) 2023 Intel Corporation.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors: Yi Liu <yi.l.liu@intel.com>
*
* Copied from
* https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c
*
*/
#include "qemu/osdep.h"
#include "qemu/chardev_open.h"
static int open_cdev_internal(const char *path, dev_t cdev)
{
struct stat st;
int fd;
fd = qemu_open_old(path, O_RDWR);
if (fd == -1) {
return -1;
}
if (fstat(fd, &st) || !S_ISCHR(st.st_mode) ||
(cdev != 0 && st.st_rdev != cdev)) {
close(fd);
return -1;
}
return fd;
}
static int open_cdev_robust(dev_t cdev)
{
g_autofree char *devpath = NULL;
/*
* This assumes that udev is being used and is creating the /dev/char/
* symlinks.
*/
devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev));
return open_cdev_internal(devpath, cdev);
}
int open_cdev(const char *devpath, dev_t cdev)
{
int fd;
fd = open_cdev_internal(devpath, cdev);
if (fd == -1 && cdev != 0) {
return open_cdev_robust(cdev);
}
return fd;
}
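For illustration, a caller that knows the device's major:minor (for a VFIO
cdev, typically parsed from the device's sysfs "dev" attribute) might use
open_cdev() like this; a sketch only, with a made-up path and numbers:

    #include "qemu/osdep.h"
    #include "qemu/chardev_open.h"

    static int mock_open_vfio_cdev(void)
    {
        /* major:minor as read from sysfs, e.g. "511:0" */
        dev_t cdev = makedev(511, 0);
        /* Tries the given path first, then /dev/char/511:0 as fallback. */
        int fd = open_cdev("/dev/vfio/devices/vfio0", cdev);

        return fd; /* -1 if the node is missing or not that char device */
    }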

util/meson.build

@@ -108,6 +108,7 @@ if have_block
util_ss.add(files('filemonitor-stub.c'))
endif
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c'))
endif
if cpu == 'aarch64'