diff --git a/MAINTAINERS b/MAINTAINERS
index fbc0662627..dff0200f70 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2545,6 +2545,13 @@ F: qapi/block*.json
 F: qapi/transaction.json
 T: git https://repo.or.cz/qemu/armbru.git block-next
 
+Compute Express Link
+M: Ben Widawsky
+M: Jonathan Cameron
+S: Supported
+F: hw/cxl/
+F: include/hw/cxl/
+
 Dirty Bitmaps
 M: Eric Blake
 M: Vladimir Sementsov-Ogievskiy
diff --git a/contrib/vhost-user-scsi/vhost-user-scsi.c b/contrib/vhost-user-scsi/vhost-user-scsi.c
index b2c0f98253..9ef61cf5a7 100644
--- a/contrib/vhost-user-scsi/vhost-user-scsi.c
+++ b/contrib/vhost-user-scsi/vhost-user-scsi.c
@@ -433,13 +433,16 @@ out:
     if (vdev_scsi) {
         g_main_loop_unref(vdev_scsi->loop);
         g_free(vdev_scsi);
-        unlink(opt_socket_path);
     }
     if (csock >= 0) {
         close(csock);
     }
     if (lsock >= 0) {
         close(lsock);
+
+        if (opt_socket_path) {
+            unlink(opt_socket_path);
+        }
     }
     g_free(opt_socket_path);
     g_free(iscsi_uri);
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index a50889c556..e1a93df263 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -18,3 +18,4 @@ Details about QEMU's various subsystems including how to add features to them.
    tracing
    vfio-migration
    writing-monitor-commands
+   virtio-backends
diff --git a/docs/devel/virtio-backends.rst b/docs/devel/virtio-backends.rst
new file mode 100644
index 0000000000..9ff092e7a0
--- /dev/null
+++ b/docs/devel/virtio-backends.rst
@@ -0,0 +1,214 @@
+..
+   Copyright (c) 2022, Linaro Limited
+   Written by Alex Bennée
+
+Writing VirtIO backends for QEMU
+================================
+
+This document attempts to outline the information a developer needs to
+know to write device emulations in QEMU. It is specifically focused on
+implementing VirtIO devices. For VirtIO the frontend is the driver
+running in the guest. The backend is everything that QEMU needs to
+do to handle the emulation of the VirtIO device. This can be done
+entirely in QEMU, divided between QEMU and the kernel (vhost), or
+handled by a separate process which is configured by QEMU
+(vhost-user).
+
+VirtIO Transports
+-----------------
+
+VirtIO supports a number of different transports. While the details of
+the configuration and operation of the device will generally be the
+same, QEMU represents them as different devices depending on the
+transport they use. For example ``-device virtio-foo`` represents the foo
+device using mmio and ``-device virtio-foo-pci`` is the same class of
+device using the PCI transport.
+
+Using the QEMU Object Model (QOM)
+---------------------------------
+
+Generally all devices in QEMU are subclasses of ``TYPE_DEVICE``;
+however, VirtIO devices should be based on ``TYPE_VIRTIO_DEVICE`` which
+itself is derived from the base class. For example:
+
+.. code:: c
+
+  static const TypeInfo virtio_blk_info = {
+      .name = TYPE_VIRTIO_BLK,
+      .parent = TYPE_VIRTIO_DEVICE,
+      .instance_size = sizeof(VirtIOBlock),
+      .instance_init = virtio_blk_instance_init,
+      .class_init = virtio_blk_class_init,
+  };
+
+The author may decide to have a more expansive class hierarchy to
+support multiple device types. For example the Virtio GPU device:
+
+.. code:: c
+
+  static const TypeInfo virtio_gpu_base_info = {
+      .name = TYPE_VIRTIO_GPU_BASE,
+      .parent = TYPE_VIRTIO_DEVICE,
+      .instance_size = sizeof(VirtIOGPUBase),
+      .class_size = sizeof(VirtIOGPUBaseClass),
+      .class_init = virtio_gpu_base_class_init,
+      .abstract = true
+  };
+
+  static const TypeInfo vhost_user_gpu_info = {
+      .name = TYPE_VHOST_USER_GPU,
+      .parent = TYPE_VIRTIO_GPU_BASE,
+      .instance_size = sizeof(VhostUserGPU),
+      .instance_init = vhost_user_gpu_instance_init,
+      .instance_finalize = vhost_user_gpu_instance_finalize,
+      .class_init = vhost_user_gpu_class_init,
+  };
+
+  static const TypeInfo virtio_gpu_info = {
+      .name = TYPE_VIRTIO_GPU,
+      .parent = TYPE_VIRTIO_GPU_BASE,
+      .instance_size = sizeof(VirtIOGPU),
+      .class_size = sizeof(VirtIOGPUClass),
+      .class_init = virtio_gpu_class_init,
+  };
+
+defines a base class for the VirtIO GPU and then specialises two
+versions, one for the internal implementation and the other for the
+vhost-user version.
+
+VirtIOPCIProxy
+^^^^^^^^^^^^^^
+
+[AJB: the following is supposition and welcomes more informed
+opinions]
+
+Probably due to legacy from the pre-QOM days, PCI VirtIO devices don't
+follow the normal hierarchy. Instead a standalone object is based
+on the ``VirtIOPCIProxy`` class and the specific VirtIO instance is
+manually instantiated:
+
+.. code:: c
+
+  /*
+   * virtio-blk-pci: This extends VirtioPCIProxy.
+   */
+  #define TYPE_VIRTIO_BLK_PCI "virtio-blk-pci-base"
+  DECLARE_INSTANCE_CHECKER(VirtIOBlkPCI, VIRTIO_BLK_PCI,
+                           TYPE_VIRTIO_BLK_PCI)
+
+  struct VirtIOBlkPCI {
+      VirtIOPCIProxy parent_obj;
+      VirtIOBlock vdev;
+  };
+
+  static Property virtio_blk_pci_properties[] = {
+      DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+      DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+                      VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+      DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+                         DEV_NVECTORS_UNSPECIFIED),
+      DEFINE_PROP_END_OF_LIST(),
+  };
+
+  static void virtio_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+  {
+      VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(vpci_dev);
+      DeviceState *vdev = DEVICE(&dev->vdev);
+
+      ...
+
+      qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+  }
+
+  static void virtio_blk_pci_class_init(ObjectClass *klass, void *data)
+  {
+      DeviceClass *dc = DEVICE_CLASS(klass);
+      VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+      PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+      set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+      device_class_set_props(dc, virtio_blk_pci_properties);
+      k->realize = virtio_blk_pci_realize;
+      pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+      pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+      pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+      pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+  }
+
+  static void virtio_blk_pci_instance_init(Object *obj)
+  {
+      VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(obj);
+
+      virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                  TYPE_VIRTIO_BLK);
+      object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+                                "bootindex");
+  }
+
+  static const VirtioPCIDeviceTypeInfo virtio_blk_pci_info = {
+      .base_name = TYPE_VIRTIO_BLK_PCI,
+      .generic_name = "virtio-blk-pci",
+      .transitional_name = "virtio-blk-pci-transitional",
+      .non_transitional_name = "virtio-blk-pci-non-transitional",
+      .instance_size = sizeof(VirtIOBlkPCI),
+      .instance_init = virtio_blk_pci_instance_init,
+      .class_init = virtio_blk_pci_class_init,
+  };
+
+Here you can see the ``instance_init`` has to manually instantiate the
+underlying ``TYPE_VIRTIO_BLK`` object and link an alias for one of
+its properties to the PCI device.
+
+
+Back End Implementations
+------------------------
+
+There are a number of places where the implementation of the backend
+can be done:
+
+* in QEMU itself
+* in the host kernel (a.k.a. vhost)
+* in a separate process (a.k.a. vhost-user)
+
+vhost_ops vs TYPE_VHOST_USER_BACKEND
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two choices for how to implement vhost code. Most of the code
+which has to work with either vhost or vhost-user uses
+``vhost_dev_init()`` to instantiate the appropriate backend. This
+means including a ``struct vhost_dev`` in the main object structure.
+
+For vhost-user devices you also need to add code to track the
+initialisation of the ``chardev`` device used for the control socket
+between QEMU and the external vhost-user process.
+
+If you only need to implement a vhost-user backend, the other option is
+to use a QOM-ified version of vhost-user.
+
+.. code:: c
+
+  static void
+  vhost_user_gpu_instance_init(Object *obj)
+  {
+      VhostUserGPU *g = VHOST_USER_GPU(obj);
+
+      g->vhost = VHOST_USER_BACKEND(object_new(TYPE_VHOST_USER_BACKEND));
+      object_property_add_alias(obj, "chardev",
+                                OBJECT(g->vhost), "chardev");
+  }
+
+  static const TypeInfo vhost_user_gpu_info = {
+      .name = TYPE_VHOST_USER_GPU,
+      .parent = TYPE_VIRTIO_GPU_BASE,
+      .instance_size = sizeof(VhostUserGPU),
+      .instance_init = vhost_user_gpu_instance_init,
+      .instance_finalize = vhost_user_gpu_instance_finalize,
+      .class_init = vhost_user_gpu_class_init,
+  };
+
+Using it this way entails adding a ``struct VhostUserBackend`` to your
+core object structure and manually instantiating the backend. This
+sub-structure tracks both the ``vhost_dev`` and ``CharDev`` types
+needed for the connection. Instead of calling ``vhost_dev_init`` you
+would call ``vhost_user_backend_dev_init`` which does what is needed
+on your behalf.
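+
+To make the first option concrete, the following is a minimal sketch
+of a device embedding a ``struct vhost_dev``. It is illustrative
+rather than taken from an in-tree device: the ``VhostUserFoo`` type is
+hypothetical and the exact signatures should be checked against the
+current headers.
+
+.. code:: c
+
+  #include "hw/virtio/virtio.h"
+  #include "hw/virtio/vhost.h"
+  #include "hw/virtio/vhost-user.h"
+  #include "chardev/char-fe.h"
+
+  /* hypothetical device state embedding the vhost control structures */
+  struct VhostUserFoo {
+      VirtIODevice parent_obj;
+      CharBackend chardev;     /* control socket to the external process */
+      VhostUserState vhost_user;
+      struct vhost_dev vhost_dev;
+      struct vhost_virtqueue vhost_vq;
+  };
+
+  static void vhost_user_foo_connect(struct VhostUserFoo *foo, Error **errp)
+  {
+      /* wrap the chardev so the vhost layer can drive the socket */
+      if (!vhost_user_init(&foo->vhost_user, &foo->chardev, errp)) {
+          return;
+      }
+
+      foo->vhost_dev.nvqs = 1;
+      foo->vhost_dev.vqs = &foo->vhost_vq;
+
+      /* selects the vhost-user backend ops and negotiates features */
+      vhost_dev_init(&foo->vhost_dev, &foo->vhost_user,
+                     VHOST_BACKEND_TYPE_USER, 0, errp);
+  }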
diff --git a/docs/interop/vhost-user-gpu.rst b/docs/interop/vhost-user-gpu.rst
index 71a2c52b31..1640553729 100644
--- a/docs/interop/vhost-user-gpu.rst
+++ b/docs/interop/vhost-user-gpu.rst
@@ -13,10 +13,10 @@ Introduction
 ============
 
 The vhost-user-gpu protocol is aiming at sharing the rendering result
-of a virtio-gpu, done from a vhost-user slave process to a vhost-user
-master process (such as QEMU). It bears a resemblance to a display
+of a virtio-gpu, done from a vhost-user back-end process to a vhost-user
+front-end process (such as QEMU). It bears a resemblance to a display
 server protocol, if you consider QEMU as the display server and the
-slave as the client, but in a very limited way. Typically, it will
+back-end as the client, but in a very limited way. Typically, it will
 work by setting a scanout/display configuration, before sending flush
 events for the display updates. It will also update the cursor shape
 and position.
@@ -26,8 +26,8 @@ socket ancillary data to share opened file descriptors (DMABUF fds or
 shared memory). The socket is usually obtained via
 ``VHOST_USER_GPU_SET_SOCKET``.
 
-Requests are sent by the *slave*, and the optional replies by the
-*master*.
+Requests are sent by the *back-end*, and the optional replies by the
+*front-end*.
 
 Wire format
 ===========
diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index f9e721ba5f..a99ba4433c 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -23,19 +23,19 @@ space process on the same host. It uses communication over a Unix
 domain socket to share file descriptors in the ancillary data of the
 message.
 
-The protocol defines 2 sides of the communication, *master* and
-*slave*. *Master* is the application that shares its virtqueues, in
-our case QEMU. *Slave* is the consumer of the virtqueues.
+The protocol defines 2 sides of the communication, *front-end* and
+*back-end*. The *front-end* is the application that shares its virtqueues, in
+our case QEMU. The *back-end* is the consumer of the virtqueues.
 
-In the current implementation QEMU is the *master*, and the *slave* is
-the external process consuming the virtio queues, for example a
+In the current implementation QEMU is the *front-end*, and the *back-end*
+is the external process consuming the virtio queues, for example a
 software Ethernet switch running in user space, such as Snabbswitch,
-or a block device backend processing read & write to a virtual
-disk. In order to facilitate interoperability between various backend
+or a block device back-end processing read & write to a virtual
+disk. In order to facilitate interoperability between various back-end
 implementations, it is recommended to follow the
 :ref:`Backend program conventions `.
 
-*Master* and *slave* can be either a client (i.e. connecting) or
+The *front-end* and *back-end* can be either a client (i.e. connecting) or
 server (listening) in the socket communication.
 
 Support for platforms other than Linux
@@ -77,7 +77,7 @@ Header
 
 :flags: 32-bit bit field
 
 - Lower 2 bits are the version (currently 0x01)
-- Bit 2 is the reply flag - needs to be sent on each reply from the slave
+- Bit 2 is the reply flag - needs to be sent on each reply from the back-end
 - Bit 3 is the need_reply flag - see :ref:`REPLY_ACK ` for
   details.
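+
+Expressed as C, the fixed 12-byte header that precedes every payload
+can be sketched as below. This is an illustrative reading of the wire
+format rather than a struct lifted from an implementation, although
+the mask names follow QEMU's vhost-user code::
+
+    #include <stdint.h>
+
+    /* lower 2 bits of flags hold the protocol version (currently 0x01) */
+    #define VHOST_USER_VERSION_MASK    0x3
+    #define VHOST_USER_REPLY_MASK      (0x1 << 2) /* set on back-end replies */
+    #define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) /* see REPLY_ACK below */
+
+    struct vhost_user_header {
+        uint32_t request; /* message type */
+        uint32_t flags;   /* version, reply and need_reply bits */
+        uint32_t size;    /* size of the payload that follows */
+    } __attribute__((packed));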
@@ -222,8 +222,8 @@ Virtio device config space
 :size: a 32-bit configuration space access size in bytes
 
 :flags: a 32-bit value:
 
-  - 0: Vhost master messages used for writeable fields
-  - 1: Vhost master messages used for live migration
+  - 0: Vhost front-end messages used for writeable fields
+  - 1: Vhost front-end messages used for live migration
 
 :payload: Size bytes array holding the contents of the virtio
   device's configuration space
@@ -290,8 +290,8 @@ vhost for the Linux Kernel.
 Most messages that can be sent via the Unix domain socket implementing
 vhost-user have an equivalent ioctl to the kernel implementation.
 
-The communication consists of *master* sending message requests and
-*slave* sending message replies. Most of the requests don't require
+The communication consists of the *front-end* sending message requests and
+the *back-end* sending message replies. Most of the requests don't require
 replies. Here is a list of the ones that do:
 
 * ``VHOST_USER_GET_FEATURES``
@@ -305,7 +305,7 @@ replies. Here is a list of the ones that do:
 :ref:`REPLY_ACK `
 The section on ``REPLY_ACK`` protocol extension.
 
-There are several messages that the master sends with file descriptors passed
+There are several messages that the front-end sends with file descriptors passed
 in the ancillary data:
 
 * ``VHOST_USER_ADD_MEM_REG``
 * ``VHOST_USER_SET_SLAVE_REQ_FD``
 * ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
 
-If *master* is unable to send the full message or receives a wrong
+If the *front-end* is unable to send the full message or receives a wrong
 reply it will close the connection. An optional reconnection mechanism
 can be implemented.
 
-If *slave* detects some error such as incompatible features, it may also
+If the *back-end* detects some error such as incompatible features, it may also
 close the connection. This should only happen in exceptional
 circumstances.
 
 Any protocol extensions are gated by protocol feature bits, which
-allows full backwards compatibility on both master and slave. As
-older slaves don't support negotiating protocol features, a feature
+allows full backwards compatibility on both front-end and back-end. As
+older back-ends don't support negotiating protocol features, a feature
 bit was dedicated for this purpose::
 
   #define VHOST_USER_F_PROTOCOL_FEATURES 30
 
-Starting and stopping rings
----------------------------
+Note that VHOST_USER_F_PROTOCOL_FEATURES is the UNUSED (30) feature
+bit defined in `VIRTIO 1.1 6.3 Legacy Interface: Reserved Feature Bits
+`_.
+VIRTIO devices do not advertise this feature bit and therefore VIRTIO
+drivers cannot negotiate it.
 
-Client must only process each ring when it is started.
+This reserved feature bit was reused by the vhost-user protocol to add
+vhost-user protocol feature negotiation in a backwards compatible
+fashion. Old vhost-user front-end and back-end implementations continue to
+work even though they are not aware of vhost-user protocol feature
+negotiation.
 
-Client must only pass data between the ring and the backend, when the
-ring is enabled.
+Ring states
+-----------
 
-If ring is started but disabled, client must process the ring without
-talking to the backend.
+Rings can be in one of three states:
 
-For example, for a networking device, in the disabled state client
-must not supply any new RX packets, but must process and discard any
-TX packets.
+* stopped: the back-end must not process the ring at all.
+
+* started but disabled: the back-end must process the ring without
+  causing any side effects. For example, for a networking device,
+  in the disabled state the back-end must not supply any new RX packets,
+  but must process and discard any TX packets.
+
+* started and enabled.
+
+Each ring is initialized in a stopped state. The back-end must start
+the ring upon receiving a kick (that is, detecting that the file
+descriptor is readable) on the descriptor specified by
+``VHOST_USER_SET_VRING_KICK`` or receiving the in-band message
+``VHOST_USER_VRING_KICK`` if negotiated, and stop the ring upon
+receiving ``VHOST_USER_GET_VRING_BASE``.
+
+Rings can be enabled or disabled by ``VHOST_USER_SET_VRING_ENABLE``.
 
 If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the
-ring is initialized in an enabled state.
+ring starts directly in the enabled state.
 
 If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is
-initialized in a disabled state. Client must not pass data to/from the
-backend until ring is enabled by ``VHOST_USER_SET_VRING_ENABLE`` with
-parameter 1, or after it has been disabled by
-``VHOST_USER_SET_VRING_ENABLE`` with parameter 0.
+initialized in a disabled state and is enabled by
+``VHOST_USER_SET_VRING_ENABLE`` with parameter 1.
 
-Each ring is initialized in a stopped state, client must not process
-it until ring is started, or after it has been stopped.
-
-Client must start ring upon receiving a kick (that is, detecting that
-file descriptor is readable) on the descriptor specified by
-``VHOST_USER_SET_VRING_KICK`` or receiving the in-band message
-``VHOST_USER_VRING_KICK`` if negotiated, and stop ring upon receiving
-``VHOST_USER_GET_VRING_BASE``.
-
-While processing the rings (whether they are enabled or not), client
+While processing the rings (whether they are enabled or not), the back-end
 must support changing some configuration aspects on the fly.
 
 Multiple queue support
 ----------------------
 
-Many devices have a fixed number of virtqueues. In this case the master
+Many devices have a fixed number of virtqueues. In this case the front-end
 already knows the number of available virtqueues without communicating with the
-slave.
+back-end.
 
 Some devices do not have a fixed number of virtqueues. Instead the maximum
-number of virtqueues is chosen by the slave. The number can depend on host
-resource availability or slave implementation details. Such devices are called
+number of virtqueues is chosen by the back-end. The number can depend on host
+resource availability or back-end implementation details. Such devices are called
 multiple queue devices.
 
-Multiple queue support allows the slave to advertise the maximum number of
-queues. This is treated as a protocol extension, hence the slave has to
+Multiple queue support allows the back-end to advertise the maximum number of
+queues. This is treated as a protocol extension, hence the back-end has to
 implement protocol features first. The multiple queues feature is supported
 only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ`` (bit 0) is set.
 
-The max number of queues the slave supports can be queried with message
-``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the number of requested
+The max number of queues the back-end supports can be queried with message
+``VHOST_USER_GET_QUEUE_NUM``. The front-end should stop when the number of requested
 queues is bigger than that.
 
-As all queues share one connection, the master uses a unique index for each
+As all queues share one connection, the front-end uses a unique index for each
 queue in the sent message to identify a specified queue.
 
-The master enables queues by sending message ``VHOST_USER_SET_VRING_ENABLE``.
+The front-end enables queues by sending the message ``VHOST_USER_SET_VRING_ENABLE``.
 vhost-user-net has historically automatically enabled the first queue pair.
 
-Slaves should always implement the ``VHOST_USER_PROTOCOL_F_MQ`` protocol
+Back-ends should always implement the ``VHOST_USER_PROTOCOL_F_MQ`` protocol
 feature, even for devices with a fixed number of virtqueues, since it is simple
 to implement and offers a degree of introspection.
 
-Masters must not rely on the ``VHOST_USER_PROTOCOL_F_MQ`` protocol feature for
+Front-ends must not rely on the ``VHOST_USER_PROTOCOL_F_MQ`` protocol feature for
 devices with a fixed number of virtqueues. Only true multiqueue devices
 require this protocol feature.
 
 Migration
 ---------
 
-During live migration, the master may need to track the modifications
-the slave makes to the memory mapped regions. The client should mark
+During live migration, the front-end may need to track the modifications
+the back-end makes to the memory mapped regions. The front-end should mark
 the dirty pages in a log. Once it complies to this logging, it may
 declare the ``VHOST_F_LOG_ALL`` vhost feature.
 
-To start/stop logging of data/used ring writes, server may send
+To start/stop logging of data/used ring writes, the front-end may send
 messages ``VHOST_USER_SET_FEATURES`` with ``VHOST_F_LOG_ALL`` and
 ``VHOST_USER_SET_VRING_ADDR`` with ``VHOST_VRING_F_LOG`` in ring's
 flags set to 1/0, respectively.
@@ -425,7 +433,7 @@ Dirty pages are of size::
 
   #define VHOST_LOG_PAGE 0x1000
 
 The log memory fd is provided in the ancillary data of
-``VHOST_USER_SET_LOG_BASE`` message when the slave has
+``VHOST_USER_SET_LOG_BASE`` message when the back-end has
 ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature.
 
 The size of the log is supplied as part of ``VhostUserMsg`` which
@@ -451,26 +459,26 @@ the bit offset of the last byte of the ring must fall within the size
 supplied by ``VhostUserLog``.
 
 ``VHOST_USER_SET_LOG_FD`` is an optional message with an eventfd in
-ancillary data, it may be used to inform the master that the log has
+ancillary data; it may be used to inform the front-end that the log has
 been modified.
 
 Once the source has finished migration, rings will be stopped by the
 source. No further update must be done before rings are restarted.
 
-In postcopy migration the slave is started before all the memory has
+In postcopy migration the back-end is started before all the memory has
 been received from the source host, and care must be taken to avoid
-accessing pages that have yet to be received. The slave opens a
+accessing pages that have yet to be received. The back-end opens a
 'userfault'-fd and registers the memory with it; this fd is then
-passed back over to the master. The master services requests on the
+passed back over to the front-end. The front-end services requests on the
 userfaultfd for pages that are accessed and when the page is available
 it performs WAKE ioctl's on the userfaultfd to wake the stalled
-slave. The client indicates support for this via the
+back-end. The front-end indicates support for this via the
 ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature.
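+
+As an informal illustration (not itself part of the protocol text),
+the logging described above amounts to the back-end atomically setting
+one bit per dirtied ``VHOST_LOG_PAGE`` in the shared log, indexed by
+guest physical address; the helper below is a sketch, not a quote from
+an implementation::
+
+    #include <stdint.h>
+
+    #define VHOST_LOG_PAGE 0x1000
+
+    /* Mark one guest-physical page dirty in the shared log bitmap.
+     * The atomic OR is needed because the front-end may scan the log
+     * concurrently while migration is in progress. */
+    static void vhost_log_page(uint8_t *log, uint64_t gpa)
+    {
+        uint64_t page = gpa / VHOST_LOG_PAGE;
+
+        __atomic_fetch_or(&log[page / 8], 1u << (page % 8), __ATOMIC_RELAXED);
+    }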
 Memory access
 -------------
 
-The master sends a list of vhost memory regions to the slave using the
+The front-end sends a list of vhost memory regions to the back-end using the
 ``VHOST_USER_SET_MEM_TABLE`` message. Each region has two base
 addresses: a guest address and a user address.
 
@@ -495,60 +503,60 @@ IOMMU support
 -------------
 
 When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated, the
-master sends IOTLB entries update & invalidation by sending
-``VHOST_USER_IOTLB_MSG`` requests to the slave with a ``struct
+front-end sends IOTLB entries update & invalidation by sending
+``VHOST_USER_IOTLB_MSG`` requests to the back-end with a ``struct
 vhost_iotlb_msg`` as payload. For update events, the ``iotlb`` payload
 has to be filled with the update message type (2), the I/O virtual
 address, the size, the user virtual address, and the permissions
 flags. Addresses and size must be within vhost memory regions set via
 the ``VHOST_USER_SET_MEM_TABLE`` request. For invalidation events, the
 ``iotlb`` payload has to be filled with the invalidation message type
-(3), the I/O virtual address and the size. On success, the slave is
+(3), the I/O virtual address and the size. On success, the back-end is
 expected to reply with a zero payload, non-zero otherwise.
 
-The slave relies on the slave communication channel (see :ref:`Slave
-communication ` section below) to send IOTLB miss
+The back-end relies on the back-end communication channel (see :ref:`Back-end
+communication ` section below) to send IOTLB miss
 and access failure events, by sending ``VHOST_USER_SLAVE_IOTLB_MSG``
-requests to the master with a ``struct vhost_iotlb_msg`` as
+requests to the front-end with a ``struct vhost_iotlb_msg`` as
 payload. For miss events, the iotlb payload has to be filled with the
 miss message type (1), the I/O virtual address and the permissions
 flags. For access failure event, the iotlb payload has to be filled
 with the access failure message type (4), the I/O virtual address and
-the permissions flags. For synchronization purpose, the slave may
-rely on the reply-ack feature, so the master may send a reply when
+the permissions flags. For synchronization purposes, the back-end may
+rely on the reply-ack feature, so the front-end may send a reply when
 operation is completed if the reply-ack feature is negotiated and
-slaves requests a reply. For miss events, completed operation means
-either master sent an update message containing the IOTLB entry
-containing requested address and permission, or master sent nothing if
+the back-end requests a reply. For miss events, completed operation means
+either the front-end sent an update message containing the IOTLB entry
+containing requested address and permission, or the front-end sent nothing if
 the IOTLB miss message is invalid (invalid IOVA or permission).
 
-The master isn't expected to take the initiative to send IOTLB update
-messages, as the slave sends IOTLB miss messages for the guest virtual
+The front-end isn't expected to take the initiative to send IOTLB update
+messages, as the back-end sends IOTLB miss messages for the guest virtual
 memory areas it needs to access.
 
-.. _slave_communication:
+.. _backend_communication:
 
-Slave communication
--------------------
+Back-end communication
+----------------------
 
-An optional communication channel is provided if the slave declares
+An optional communication channel is provided if the back-end declares
 ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` protocol feature, to allow the
-slave to make requests to the master.
+back-end to make requests to the front-end.
 
 The fd is provided via ``VHOST_USER_SET_SLAVE_REQ_FD`` ancillary data.
 
-A slave may then send ``VHOST_USER_SLAVE_*`` messages to the master
+A back-end may then send ``VHOST_USER_SLAVE_*`` messages to the front-end
 using this fd communication channel.
 
 If ``VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD`` protocol feature is
-negotiated, slave can send file descriptors (at most 8 descriptors in
-each message) to master via ancillary data using this fd communication
+negotiated, the back-end can send file descriptors (at most 8 descriptors in
+each message) to the front-end via ancillary data using this fd communication
 channel.
 
 Inflight I/O tracking
 ---------------------
 
-To support reconnecting after restart or crash, slave may need to
+To support reconnecting after restart or crash, the back-end may need to
 resubmit inflight I/Os. If virtqueue is processed in order, we can
 easily achieve that by getting the inflight descriptors from
 descriptor table (split virtqueue) or descriptor ring (packed
@@ -556,18 +564,18 @@ virtqueue). However, it can't work when we process descriptors
 out-of-order because some entries which store the information of
 inflight descriptors in available ring (split virtqueue) or descriptor
 ring (packed virtqueue) might be overridden by new entries. To solve
-this problem, slave need to allocate an extra buffer to store this
-information of inflight descriptors and share it with master for
+this problem, the back-end needs to allocate an extra buffer to store this
+information of inflight descriptors and share it with the front-end for
 persistent. ``VHOST_USER_GET_INFLIGHT_FD`` and
 ``VHOST_USER_SET_INFLIGHT_FD`` are used to transfer this buffer
-between master and slave. And the format of this buffer is described
+between the front-end and back-end. The format of this buffer is described
 below:
 
 +---------------+---------------+-----+---------------+
 | queue0 region | queue1 region | ... | queueN region |
 +---------------+---------------+-----+---------------+
 
-N is the number of available virtqueues. Slave could get it from num
+N is the number of available virtqueues. The back-end could get it from num
 queues field of ``VhostUserInflight``.
 
 For split virtqueue, queue region can be implemented as:
@@ -599,8 +607,8 @@ For split virtqueue, queue region can be implemented as:
 
     * Zero value indicates an uninitialized buffer */
    uint16_t version;
 
-   /* The size of DescStateSplit array. It's equal to the virtqueue
-    * size. Slave could get it from queue size field of VhostUserInflight. */
+   /* The size of DescStateSplit array. It's equal to the virtqueue size.
+    * The back-end could get it from queue size field of VhostUserInflight. */
    uint16_t desc_num;
 
    /* The head of list that track the last batch of used descriptors. */
@@ -706,8 +714,8 @@ For packed virtqueue, queue region can be implemented as:
 
     * Zero value indicates an uninitialized buffer */
    uint16_t version;
 
-   /* The size of DescStatePacked array. It's equal to the virtqueue
-    * size. Slave could get it from queue size field of VhostUserInflight. */
+   /* The size of DescStatePacked array. It's equal to the virtqueue size.
+    * The back-end could get it from queue size field of VhostUserInflight. */
    uint16_t desc_num;
 
    /* The head of free DescStatePacked entry list */
@@ -799,7 +807,7 @@ When reconnecting:
 
 #. Use ``old_used_wrap_counter`` to calculate the available flags
 
 #. If ``d.flags`` is not equal to the calculated flags value (means
-   slave has submitted the buffer to guest driver before crash, so
+   the back-end has submitted the buffer to the guest driver before crash, so
    it has to commit the in-progres update), set ``old_free_head``,
    ``old_used_idx``, ``old_used_wrap_counter`` to ``free_head``,
    ``used_idx``, ``used_wrap_counter``
@@ -828,11 +836,11 @@ cause the sending application(s) to block, it is not advised to use
 this feature unless absolutely necessary. It is also considered an
 error to negotiate this feature without also negotiating
 ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` and ``VHOST_USER_PROTOCOL_F_REPLY_ACK``,
-the former is necessary for getting a message channel from the slave
-to the master, while the latter needs to be used with the in-band
+the former is necessary for getting a message channel from the back-end
+to the front-end, while the latter needs to be used with the in-band
 notification messages to block until they are processed, both to avoid
 blocking later and for proper processing (at least in the simulation
-use case.) As it has no other way of signalling this error, the slave
+use case.) As it has no other way of signalling this error, the back-end
 should close the connection as a response to a
 ``VHOST_USER_SET_PROTOCOL_FEATURES`` message that sets the in-band
 notifications feature flag without the other two.
@@ -860,95 +868,101 @@ Protocol features
 
   #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15
   #define VHOST_USER_PROTOCOL_F_STATUS 16
 
-Master message types
---------------------
+Front-end message types
+-----------------------
 
 ``VHOST_USER_GET_FEATURES``
   :id: 1
  :equivalent ioctl: ``VHOST_GET_FEATURES``
-  :master payload: N/A
-  :slave payload: ``u64``
+  :request payload: N/A
+  :reply payload: ``u64``
 
   Get from the underlying vhost implementation the features bitmask.
-  Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals slave support
+  Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals back-end support
   for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
   ``VHOST_USER_SET_PROTOCOL_FEATURES``.
 
 ``VHOST_USER_SET_FEATURES``
  :id: 2
  :equivalent ioctl: ``VHOST_SET_FEATURES``
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Enable features in the underlying vhost implementation using a
  bitmask. Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals
-  slave support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
+  back-end support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
  ``VHOST_USER_SET_PROTOCOL_FEATURES``.
 
 ``VHOST_USER_GET_PROTOCOL_FEATURES``
  :id: 15
  :equivalent ioctl: ``VHOST_GET_FEATURES``
-  :master payload: N/A
-  :slave payload: ``u64``
+  :request payload: N/A
+  :reply payload: ``u64``
 
  Get the protocol feature bitmask from the underlying vhost
  implementation. Only legal if feature bit
  ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in
-  ``VHOST_USER_GET_FEATURES``.
+  ``VHOST_USER_GET_FEATURES``. It does not need to be acknowledged by
+  ``VHOST_USER_SET_FEATURES``.
 
 .. Note::
-   Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must
+   Back-ends that report ``VHOST_USER_F_PROTOCOL_FEATURES`` must
    support this message even before ``VHOST_USER_SET_FEATURES`` was
    called.
 
 ``VHOST_USER_SET_PROTOCOL_FEATURES``
  :id: 16
  :equivalent ioctl: ``VHOST_SET_FEATURES``
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Enable protocol features in the underlying vhost implementation.
 
  Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in
-  ``VHOST_USER_GET_FEATURES``.
+  ``VHOST_USER_GET_FEATURES``. It does not need to be acknowledged by
+  ``VHOST_USER_SET_FEATURES``.
 
 .. Note::
-   Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must support
+   Back-ends that report ``VHOST_USER_F_PROTOCOL_FEATURES`` must support
    this message even before ``VHOST_USER_SET_FEATURES`` was called.
 
 ``VHOST_USER_SET_OWNER``
  :id: 3
  :equivalent ioctl: ``VHOST_SET_OWNER``
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
-  Issued when a new connection is established. It sets the current
-  *master* as an owner of the session. This can be used on the *slave*
+  Issued when a new connection is established. It marks the sender
+  as the front-end that owns the session. This can be used on the *back-end*
  as a "session start" flag.
 
 ``VHOST_USER_RESET_OWNER``
  :id: 4
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
 .. admonition:: Deprecated
 
   This is no longer used. Used to be sent to request disabling all
-   rings, but some clients interpreted it to also discard connection
+   rings, but some back-ends interpreted it to also discard connection
   state (this interpretation would lead to bugs). It is recommended
-   that clients either ignore this message, or use it to disable all
+   that back-ends either ignore this message, or use it to disable all
   rings.
 
 ``VHOST_USER_SET_MEM_TABLE``
  :id: 5
  :equivalent ioctl: ``VHOST_SET_MEM_TABLE``
-  :master payload: memory regions description
-  :slave payload: (postcopy only) memory regions description
+  :request payload: memory regions description
+  :reply payload: (postcopy only) memory regions description
 
-  Sets the memory map regions on the slave so it can translate the
+  Sets the memory map regions on the back-end so it can translate the
  vring addresses. In the ancillary data there is an array of file
  descriptors for each memory mapped region. The size and ordering of
  the fds matches the number and ordering of memory regions.
 
  When ``VHOST_USER_POSTCOPY_LISTEN`` has been received,
  ``SET_MEM_TABLE`` replies with the bases of the memory mapped
-  regions to the master. The slave must have mmap'd the regions but
+  regions to the front-end. The back-end must have mmap'd the regions but
  not yet accessed them and should not yet generate a userfault
  event.
 
@@ -962,12 +976,12 @@ Master message types
 ``VHOST_USER_SET_LOG_BASE``
  :id: 6
  :equivalent ioctl: ``VHOST_SET_LOG_BASE``
-  :master payload: u64
-  :slave payload: N/A
+  :request payload: u64
+  :reply payload: N/A
 
  Sets logging shared memory space.
 
-  When slave has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature,
+  When the back-end has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature,
  the log memory fd is provided in the ancillary data of
  ``VHOST_USER_SET_LOG_BASE`` message, the size and offset of shared
  memory area provided in the message.
@@ -975,44 +989,48 @@ Master message types
 ``VHOST_USER_SET_LOG_FD``
  :id: 7
  :equivalent ioctl: ``VHOST_SET_LOG_FD``
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
  Sets the logging file descriptor, which is passed as ancillary data.
 
 ``VHOST_USER_SET_VRING_NUM``
  :id: 8
  :equivalent ioctl: ``VHOST_SET_VRING_NUM``
-  :master payload: vring state description
+  :request payload: vring state description
+  :reply payload: N/A
 
  Set the size of the queue.
 ``VHOST_USER_SET_VRING_ADDR``
  :id: 9
  :equivalent ioctl: ``VHOST_SET_VRING_ADDR``
-  :master payload: vring address description
-  :slave payload: N/A
+  :request payload: vring address description
+  :reply payload: N/A
 
  Sets the addresses of the different aspects of the vring.
 
 ``VHOST_USER_SET_VRING_BASE``
  :id: 10
  :equivalent ioctl: ``VHOST_SET_VRING_BASE``
-  :master payload: vring state description
+  :request payload: vring state description
+  :reply payload: N/A
 
  Sets the base offset in the available vring.
 
 ``VHOST_USER_GET_VRING_BASE``
  :id: 11
  :equivalent ioctl: ``VHOST_USER_GET_VRING_BASE``
-  :master payload: vring state description
-  :slave payload: vring state description
+  :request payload: vring state description
+  :reply payload: vring state description
 
  Get the available vring base offset.
 
 ``VHOST_USER_SET_VRING_KICK``
  :id: 12
  :equivalent ioctl: ``VHOST_SET_VRING_KICK``
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Set the event file descriptor for adding buffers to the vring. It is
  passed in the ancillary data.
@@ -1030,7 +1048,8 @@ Master message types
 ``VHOST_USER_SET_VRING_CALL``
  :id: 13
  :equivalent ioctl: ``VHOST_SET_VRING_CALL``
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Set the event file descriptor to signal when buffers are used. It is
  passed in the ancillary data.
@@ -1048,7 +1067,8 @@ Master message types
 ``VHOST_USER_SET_VRING_ERR``
  :id: 14
  :equivalent ioctl: ``VHOST_SET_VRING_ERR``
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Set the event file descriptor to signal when error occurs. It is
  passed in the ancillary data.
@@ -1065,10 +1085,10 @@ Master message types
 ``VHOST_USER_GET_QUEUE_NUM``
  :id: 17
  :equivalent ioctl: N/A
-  :master payload: N/A
-  :slave payload: u64
+  :request payload: N/A
+  :reply payload: u64
 
-  Query how many queues the backend supports.
+  Query how many queues the back-end supports.
 
  This request should be sent only when ``VHOST_USER_PROTOCOL_F_MQ``
  is set in queried protocol features by
@@ -1077,9 +1097,10 @@ Master message types
 ``VHOST_USER_SET_VRING_ENABLE``
  :id: 18
  :equivalent ioctl: N/A
-  :master payload: vring state description
+  :request payload: vring state description
+  :reply payload: N/A
 
-  Signal slave to enable or disable corresponding vring.
+  Signal the back-end to enable or disable the corresponding vring.
 
  This request should be sent only when
  ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated.
@@ -1087,9 +1108,10 @@ Master message types
 ``VHOST_USER_SEND_RARP``
  :id: 19
  :equivalent ioctl: N/A
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
-  Ask vhost user backend to broadcast a fake RARP to notify the migration
+  Ask the vhost user back-end to broadcast a fake RARP to notify the migration
  is terminated for guest that does not support GUEST_ANNOUNCE.
 
  Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is
@@ -1097,12 +1119,13 @@ Master message types
  ``VHOST_USER_PROTOCOL_F_RARP`` is present in
  ``VHOST_USER_GET_PROTOCOL_FEATURES``. The first 6 bytes of the payload
  contain the mac address of the guest to allow the vhost user
-  backend to construct and broadcast the fake RARP.
+  back-end to construct and broadcast the fake RARP.
 
 ``VHOST_USER_NET_SET_MTU``
  :id: 20
  :equivalent ioctl: N/A
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Set host MTU value exposed to the guest.
@@ -1112,35 +1135,36 @@ Master message types
  ``VHOST_USER_PROTOCOL_F_NET_MTU`` is present in
  ``VHOST_USER_GET_PROTOCOL_FEATURES``.
 
-  If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must
+  If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, the back-end must
  respond with zero in case the specified MTU is valid, or non-zero
  otherwise.
 
 ``VHOST_USER_SET_SLAVE_REQ_FD``
  :id: 21
  :equivalent ioctl: N/A
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
-  Set the socket file descriptor for slave initiated requests. It is passed
+  Set the socket file descriptor for back-end initiated requests. It is passed
  in the ancillary data.
 
  This request should be sent only when
  ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, and protocol
  feature bit ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` bit is present in
  ``VHOST_USER_GET_PROTOCOL_FEATURES``. If
-  ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must
+  ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, the back-end must
  respond with zero for success, non-zero otherwise.
 
 ``VHOST_USER_IOTLB_MSG``
  :id: 22
  :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type)
-  :master payload: ``struct vhost_iotlb_msg``
-  :slave payload: ``u64``
+  :request payload: ``struct vhost_iotlb_msg``
+  :reply payload: ``u64``
 
  Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload.
 
-  Master sends such requests to update and invalidate entries in the
-  device IOTLB. The slave has to acknowledge the request with sending
+  The front-end sends such requests to update and invalidate entries in the
+  device IOTLB. The back-end has to acknowledge the request by sending
  zero as ``u64`` payload for success, non-zero otherwise.
 
 This request should be send only when ``VIRTIO_F_IOMMU_PLATFORM``
@@ -1149,7 +1173,8 @@ Master message types
 ``VHOST_USER_SET_VRING_ENDIAN``
  :id: 23
  :equivalent ioctl: ``VHOST_SET_VRING_ENDIAN``
-  :master payload: vring state description
+  :request payload: vring state description
+  :reply payload: N/A
 
  Set the endianness of a VQ for legacy devices. Little-endian is
  indicated with state.num set to 0 and big-endian is indicated with
@@ -1159,42 +1184,42 @@ Master message types
  ``VHOST_USER_PROTOCOL_F_CROSS_ENDIAN`` has been negotiated.
  Backends that negotiated this feature should handle both
  endiannesses and expect this message once (per VQ) during device
-  configuration (ie. before the master starts the VQ).
+  configuration (i.e. before the front-end starts the VQ).
 
 ``VHOST_USER_GET_CONFIG``
  :id: 24
  :equivalent ioctl: N/A
-  :master payload: virtio device config space
-  :slave payload: virtio device config space
+  :request payload: virtio device config space
+  :reply payload: virtio device config space
 
  When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is
-  submitted by the vhost-user master to fetch the contents of the
-  virtio device configuration space, vhost-user slave's payload size
-  MUST match master's request, vhost-user slave uses zero length of
-  payload to indicate an error to vhost-user master. The vhost-user
-  master may cache the contents to avoid repeated
+  submitted by the vhost-user front-end to fetch the contents of the
+  virtio device configuration space. The vhost-user back-end's payload size
+  MUST match the front-end's request, and the vhost-user back-end uses a
+  zero-length payload to indicate an error to the vhost-user front-end. The
+  vhost-user front-end may cache the contents to avoid repeated
  ``VHOST_USER_GET_CONFIG`` calls.
 
 ``VHOST_USER_SET_CONFIG``
  :id: 25
  :equivalent ioctl: N/A
-  :master payload: virtio device config space
-  :slave payload: N/A
+  :request payload: virtio device config space
+  :reply payload: N/A
 
  When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is
-  submitted by the vhost-user master when the Guest changes the virtio
+  submitted by the vhost-user front-end when the Guest changes the virtio
  device configuration space and also can be used for live migration
-  on the destination host. The vhost-user slave must check the flags
-  field, and slaves MUST NOT accept SET_CONFIG for read-only
+  on the destination host. The vhost-user back-end must check the flags
+  field, and back-ends MUST NOT accept SET_CONFIG for read-only
  configuration space fields unless the live migration bit is set.
 
 ``VHOST_USER_CREATE_CRYPTO_SESSION``
  :id: 26
  :equivalent ioctl: N/A
-  :master payload: crypto session description
-  :slave payload: crypto session description
+  :request payload: crypto session description
+  :reply payload: crypto session description
 
-  Create a session for crypto operation. The server side must return
+  Create a session for crypto operation. The back-end must return
  the session id, 0 or positive for success, negative for failure.
  This request should be sent only when
  ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been
@@ -1204,7 +1229,8 @@ Master message types
 ``VHOST_USER_CLOSE_CRYPTO_SESSION``
  :id: 27
  :equivalent ioctl: N/A
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  Close a session for crypto operation which was previously
  created by ``VHOST_USER_CREATE_CRYPTO_SESSION``.
@@ -1216,20 +1242,21 @@ Master message types
 ``VHOST_USER_POSTCOPY_ADVISE``
  :id: 28
-  :master payload: N/A
-  :slave payload: userfault fd
+  :request payload: N/A
+  :reply payload: userfault fd
 
-  When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the master
-  advises slave that a migration with postcopy enabled is underway,
-  the slave must open a userfaultfd for later use. Note that at this
+  When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the front-end
+  advises the back-end that a migration with postcopy enabled is underway,
+  the back-end must open a userfaultfd for later use. Note that at this
  stage the migration is still in precopy mode.
 
 ``VHOST_USER_POSTCOPY_LISTEN``
  :id: 29
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
-  Master advises slave that a transition to postcopy mode has
-  happened. The slave must ensure that shared memory is registered
+  The front-end advises the back-end that a transition to postcopy mode has
+  happened. The back-end must ensure that shared memory is registered
  with userfaultfd to cause faulting of non-present pages.
 
  This is always sent sometime after a ``VHOST_USER_POSTCOPY_ADVISE``,
@@ -1237,10 +1264,11 @@ Master message types
 ``VHOST_USER_POSTCOPY_END``
  :id: 30
-  :slave payload: ``u64``
+  :request payload: N/A
+  :reply payload: ``u64``
 
-  Master advises that postcopy migration has now completed. The slave
-  must disable the userfaultfd. The response is an acknowledgement
+  The front-end advises that postcopy migration has now completed. The back-end
+  must disable the userfaultfd. The reply is an acknowledgement
  only.
 
 When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, this message
@@ -1252,156 +1280,181 @@ Master message types
 ``VHOST_USER_GET_INFLIGHT_FD``
  :id: 31
  :equivalent ioctl: N/A
-  :master payload: inflight description
+  :request payload: inflight description
+  :reply payload: N/A
 
  When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has
-  been successfully negotiated, this message is submitted by master to
-  get a shared buffer from slave. The shared buffer will be used to
-  track inflight I/O by slave. QEMU should retrieve a new one when vm
+  been successfully negotiated, this message is submitted by the front-end to
+  get a shared buffer from the back-end. The shared buffer will be used to
+  track inflight I/O by the back-end. QEMU should retrieve a new one when vm
  reset.
 
 ``VHOST_USER_SET_INFLIGHT_FD``
  :id: 32
  :equivalent ioctl: N/A
-  :master payload: inflight description
+  :request payload: inflight description
+  :reply payload: N/A
 
  When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has
-  been successfully negotiated, this message is submitted by master to
-  send the shared inflight buffer back to slave so that slave could
-  get inflight I/O after a crash or restart.
+  been successfully negotiated, this message is submitted by the front-end to
+  send the shared inflight buffer back to the back-end so that the back-end
+  could get inflight I/O after a crash or restart.
 
 ``VHOST_USER_GPU_SET_SOCKET``
  :id: 33
  :equivalent ioctl: N/A
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
  Sets the GPU protocol socket file descriptor, which is passed as
-  ancillary data. The GPU protocol is used to inform the master of
+  ancillary data. The GPU protocol is used to inform the front-end of
  rendering state and updates. See vhost-user-gpu.rst for details.
 
 ``VHOST_USER_RESET_DEVICE``
  :id: 34
  :equivalent ioctl: N/A
-  :master payload: N/A
-  :slave payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
-  Ask the vhost user backend to disable all rings and reset all
+  Ask the vhost user back-end to disable all rings and reset all
  internal device state to the initial state, ready to be
-  reinitialized. The backend retains ownership of the device
+  reinitialized. The back-end retains ownership of the device
  throughout the reset operation.
 
  Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol
-  feature is set by the backend.
+  feature is set by the back-end.
 
 ``VHOST_USER_VRING_KICK``
  :id: 35
  :equivalent ioctl: N/A
-  :slave payload: vring state description
-  :master payload: N/A
+  :request payload: vring state description
+  :reply payload: N/A
 
  When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
  feature has been successfully negotiated, this message may be
-  submitted by the master to indicate that a buffer was added to
+  submitted by the front-end to indicate that a buffer was added to
  the vring instead of signalling it using the vring's kick file
-  descriptor or having the slave rely on polling.
+  descriptor or having the back-end rely on polling.
 
  The state.num field is currently reserved and must be set to 0.
 
 ``VHOST_USER_GET_MAX_MEM_SLOTS``
  :id: 36
  :equivalent ioctl: N/A
-  :slave payload: u64
+  :request payload: N/A
+  :reply payload: u64
 
  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
  feature has been successfully negotiated, this message is submitted
-  by master to the slave. The slave should return the message with a
+  by the front-end to the back-end. The back-end should return the message with a
  u64 payload containing the maximum number of memory slots for
-  QEMU to expose to the guest. The value returned by the backend
+  QEMU to expose to the guest. The value returned by the back-end
  will be capped at the maximum number of ram slots which can be
  supported by the target platform.
 
 ``VHOST_USER_ADD_MEM_REG``
  :id: 37
  :equivalent ioctl: N/A
-  :slave payload: single memory region description
+  :request payload: single memory region description
+  :reply payload: (postcopy only) single memory region description
 
  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
  feature has been successfully negotiated, this message is submitted
-  by the master to the slave. The message payload contains a memory
+  by the front-end to the back-end. The message payload contains a memory
  region descriptor struct, describing a region of guest memory which
-  the slave device must map in. When the
+  the back-end device must map in. When the
  ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
  been successfully negotiated, along with the
  ``VHOST_USER_REM_MEM_REG`` message, this message is used to set and
-  update the memory tables of the slave device.
+  update the memory tables of the back-end device.
 
  Exactly one file descriptor from which the memory is mapped is
  passed in the ancillary data.
 
-  In postcopy mode (see ``VHOST_USER_POSTCOPY_LISTEN``), the slave
-  replies with the bases of the memory mapped region to the master.
+  In postcopy mode (see ``VHOST_USER_POSTCOPY_LISTEN``), the back-end
+  replies with the bases of the memory mapped region to the front-end.
  For further details on postcopy, see ``VHOST_USER_SET_MEM_TABLE``.
  They apply to ``VHOST_USER_ADD_MEM_REG`` accordingly.
 
 ``VHOST_USER_REM_MEM_REG``
  :id: 38
  :equivalent ioctl: N/A
-  :slave payload: single memory region description
+  :request payload: single memory region description
+  :reply payload: N/A
 
  When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
  feature has been successfully negotiated, this message is submitted
-  by the master to the slave. The message payload contains a memory
+  by the front-end to the back-end. The message payload contains a memory
  region descriptor struct, describing a region of guest memory which
-  the slave device must unmap. When the
+  the back-end device must unmap. When the
  ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
  been successfully negotiated, along with the
  ``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and
-  update the memory tables of the slave device.
+  update the memory tables of the back-end device.
 
  The memory region to be removed is identified by its guest address,
  user address and size. The mmap offset is ignored.
 
  No file descriptors SHOULD be passed in the ancillary data. For
-  compatibility with existing incorrect implementations, the slave MAY
+  compatibility with existing incorrect implementations, the back-end MAY
  accept messages with one file descriptor. If a file descriptor is
-  passed, the slave MUST close it without using it otherwise.
+  passed, the back-end MUST close it without using it otherwise.
 
 ``VHOST_USER_SET_STATUS``
  :id: 39
  :equivalent ioctl: VHOST_VDPA_SET_STATUS
-  :slave payload: N/A
-  :master payload: ``u64``
+  :request payload: ``u64``
+  :reply payload: N/A
 
  When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been
-  successfully negotiated, this message is submitted by the master to
-  notify the backend with updated device status as defined in the Virtio
+  successfully negotiated, this message is submitted by the front-end to
+  notify the back-end with updated device status as defined in the Virtio
  specification.
 
 ``VHOST_USER_GET_STATUS``
  :id: 40
  :equivalent ioctl: VHOST_VDPA_GET_STATUS
-  :slave payload: ``u64``
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: ``u64``
 
  When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been
-  successfully negotiated, this message is submitted by the master to
-  query the backend for its device status as defined in the Virtio
+  successfully negotiated, this message is submitted by the front-end to
+  query the back-end for its device status as defined in the Virtio
  specification.
 
-Slave message types
--------------------
+Back-end message types
+----------------------
+
+For this type of message, the request is sent by the back-end and the reply
+is sent by the front-end.
 
 ``VHOST_USER_SLAVE_IOTLB_MSG``
  :id: 1
  :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type)
-  :slave payload: ``struct vhost_iotlb_msg``
-  :master payload: N/A
+  :request payload: ``struct vhost_iotlb_msg``
+  :reply payload: N/A
 
  Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload.
-  Slave sends such requests to notify of an IOTLB miss, or an IOTLB
+  The back-end sends such requests to notify of an IOTLB miss, or an IOTLB
  access failure. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is
-  negotiated, and slave set the ``VHOST_USER_NEED_REPLY`` flag, master
+  negotiated, and the back-end sets the ``VHOST_USER_NEED_REPLY`` flag, the front-end
  must respond with zero when operation is successfully completed, or
  non-zero otherwise. This request should be send only when
  ``VIRTIO_F_IOMMU_PLATFORM`` feature has been successfully
@@ -1410,23 +1463,23 @@ Slave message types
 ``VHOST_USER_SLAVE_CONFIG_CHANGE_MSG``
  :id: 2
  :equivalent ioctl: N/A
-  :slave payload: N/A
-  :master payload: N/A
+  :request payload: N/A
+  :reply payload: N/A
 
  When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, vhost-user
-  slave sends such messages to notify that the virtio device's
+  back-end sends such messages to notify that the virtio device's
  configuration space has changed, for those host devices which can
  support such feature, host driver can send ``VHOST_USER_GET_CONFIG``
-  message to slave to get the latest content. If
-  ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and slave set the
-  ``VHOST_USER_NEED_REPLY`` flag, master must respond with zero when
+  message to the back-end to get the latest content. If
+  ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and the back-end sets the
+  ``VHOST_USER_NEED_REPLY`` flag, the front-end must respond with zero when
  operation is successfully completed, or non-zero otherwise.
 ``VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG``
  :id: 3
  :equivalent ioctl: N/A
-  :slave payload: vring area description
-  :master payload: N/A
+  :request payload: vring area description
+  :reply payload: N/A
 
  Sets host notifier for a specified queue. The queue index is
  contained in the ``u64`` field of the vring area description. The
@@ -1437,7 +1490,7 @@ Slave message types
  description. QEMU can mmap the file descriptor based on the size and
  offset to get a memory range. Registering a host notifier means
  mapping this memory range to the VM as the specified queue's notify
-  MMIO region. Slave sends this request to tell QEMU to de-register
+  MMIO region. The back-end sends this request to tell QEMU to de-register
  the existing notifier if any and register the new notifier if the
  request is sent with a file descriptor.
@@ -1448,28 +1501,28 @@ Slave message types
 ``VHOST_USER_SLAVE_VRING_CALL``
  :id: 4
  :equivalent ioctl: N/A
-  :slave payload: vring state description
-  :master payload: N/A
+  :request payload: vring state description
+  :reply payload: N/A
 
  When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
  feature has been successfully negotiated, this message may be
-  submitted by the slave to indicate that a buffer was used from
+  submitted by the back-end to indicate that a buffer was used from
  the vring instead of signalling this using the vring's call file
-  descriptor or having the master relying on polling.
+  descriptor or having the front-end rely on polling.
 
  The state.num field is currently reserved and must be set to 0.
 
 ``VHOST_USER_SLAVE_VRING_ERR``
  :id: 5
  :equivalent ioctl: N/A
-  :slave payload: vring state description
-  :master payload: N/A
+  :request payload: vring state description
+  :reply payload: N/A
 
  When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
  feature has been successfully negotiated, this message may be
-  submitted by the slave to indicate that an error occurred on the
+  submitted by the back-end to indicate that an error occurred on the
  specific vring, instead of signalling the error file descriptor
-  set by the master via ``VHOST_USER_SET_VRING_ERR``.
+  set by the front-end via ``VHOST_USER_SET_VRING_ERR``.
 
  The state.num field is currently reserved and must be set to 0.
 
@@ -1480,21 +1533,21 @@ VHOST_USER_PROTOCOL_F_REPLY_ACK
 -------------------------------
 
 The original vhost-user specification only demands replies for certain
 commands. This differs from the vhost protocol implementation where
-commands are sent over an ``ioctl()`` call and block until the client
+commands are sent over an ``ioctl()`` call and block until the back-end
 has completed.
 
 With this protocol extension negotiated, the sender (QEMU) can set the
 ``need_reply`` [Bit 3] flag to any command. This indicates that the
-client MUST respond with a Payload ``VhostUserMsg`` indicating success
+back-end MUST respond with a Payload ``VhostUserMsg`` indicating success
 or failure. The payload should be set to zero on success or non-zero
 on failure, unless the message already has an explicit reply body.
 
-The response payload gives QEMU a deterministic indication of the result
+The reply payload gives QEMU a deterministic indication of the result
 of the command. Today, QEMU is expected to terminate the main vhost-user
 loop upon receiving such errors. In future, qemu could be taught to be more
 resilient for selective requests.
-For the message types that already solicit a reply from the client,
+For the message types that already solicit a reply from the back-end,
the presence of ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` or need_reply bit
being set brings no behavioural change. (See the Communication_
section for details.)

@@ -1504,26 +1557,26 @@ section for details.)
Backend program conventions
===========================

-vhost-user backends can provide various devices & services and may
+vhost-user back-ends can provide various devices & services and may
need to be configured manually depending on the use case. However, it
is a good idea to follow the conventions listed here when
possible. Users, QEMU or libvirt, can then rely on some common
behaviour to avoid heterogeneous configuration and management of the
-backend programs and facilitate interoperability.
+back-end programs and facilitate interoperability.

-Each backend installed on a host system should come with at least one
+Each back-end installed on a host system should come with at least one
JSON file that conforms to the vhost-user.json schema. Each file
-informs the management applications about the backend type, and binary
+informs the management applications about the back-end type and binary
location. In addition, it defines rules for management apps for
-picking the highest priority backend when multiple match the search
+picking the highest priority back-end when multiple match the search
criteria (see ``@VhostUserBackend`` documentation in the schema file).

-If the backend is not capable of enabling a requested feature on the
+If the back-end is not capable of enabling a requested feature on the
host (such as 3D acceleration with virgl), or the initialization
-failed, the backend should fail to start early and exit with a status
+failed, the back-end should fail to start early and exit with a status
!= 0. It may also print a message to stderr for further details.

-The backend program must not daemonize itself, but it may be
+The back-end program must not daemonize itself, but it may be
daemonized by the management layer. It may also have restricted
access to the system.

@@ -1531,7 +1584,7 @@ File descriptors 0, 1 and 2 will exist, and have regular
stdin/stdout/stderr usage (they may have been redirected to /dev/null
by the management layer, or to a log handler).

-The backend program must end (as quickly and cleanly as possible) when
+The back-end program must end (as quickly and cleanly as possible) when
the SIGTERM signal is received. It may then receive SIGKILL from the
management layer after a few seconds.

@@ -1545,15 +1598,15 @@ are mandatory, unless explicitly said differently:

--fd=FDNUM

-  When this argument is given, the backend program is started with the
+  When this argument is given, the back-end program is started with the
  vhost-user socket as file descriptor FDNUM. It is incompatible with
  --socket-path.

--print-capabilities

-  Output to stdout the backend capabilities in JSON format, and then
+  Output to stdout the back-end capabilities in JSON format, and then
  exit successfully. Other options and arguments should be ignored, and
-  the backend program should not perform its normal function. The
+  the back-end program should not perform its normal function. The
  capabilities can be reported dynamically depending on the host
  capabilities.
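+
+For example, the JSON file accompanying a hypothetical vhost-user-gpu
+back-end might look like the following (the exact keys are defined by
+the vhost-user.json schema mentioned above)::
+
+  {
+    "description": "GPU back-end",
+    "type": "gpu",
+    "binary": "/usr/libexec/my-vhost-user-gpu"
+  }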
diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst
index ae8dd233e8..3b729b920d 100644
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -84,6 +84,7 @@ Emulated Devices

   devices/can.rst
   devices/ccid.rst
+   devices/cxl.rst
   devices/ivshmem.rst
   devices/net.rst
   devices/nvme.rst
diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
new file mode 100644
index 0000000000..9293cbf01a
--- /dev/null
+++ b/docs/system/devices/cxl.rst
@@ -0,0 +1,302 @@
+Compute Express Link (CXL)
+==========================
+From the view of a single host, CXL is an interconnect standard that
+targets accelerators and memory devices attached to a CXL host.
+This description will focus on those aspects visible either to
+software running on a QEMU emulated host or to the internals of
+functional emulation. As such, it will skip over many of the
+electrical and protocol elements that would be of more interest
+for real hardware and that dominate more general introductions to CXL.
+It will also completely ignore the fabric management aspects of CXL
+by considering only a single host and a static configuration.
+
+CXL shares many concepts and much of the infrastructure of PCI Express:
+CXL Host Bridges have CXL Root Ports, which may be directly attached
+to CXL or PCI End Points. Alternatively there may be CXL Switches
+with CXL and PCI Endpoints attached below them. In many cases additional
+control and capabilities are exposed via PCI Express interfaces.
+This sharing of interfaces, and hence of emulation code, is reflected
+in how the devices are emulated in QEMU. In most cases the various
+CXL elements are built upon equivalent PCIe devices.
+
+CXL devices support the following interfaces:
+
+* Most conventional PCIe interfaces
+
+  - Configuration space access
+  - BAR mapped memory accesses used for registers and mailboxes.
+  - MSI/MSI-X
+  - AER
+  - DOE mailboxes
+  - IDE
+  - Many other PCI Express defined interfaces...
+
+* Memory operations
+
+  - Equivalent of accessing DRAM / NVDIMMs. Any access / feature
+    supported by the host for normal memory should also work for
+    CXL attached memory devices.
+
+* Cache operations. These are mostly irrelevant to QEMU emulation as
+  QEMU is not emulating a coherency protocol. Any emulation related
+  to these will be device specific and is out of the scope of this
+  document.
+
+CXL 2.0 Device Types
+--------------------
+CXL 2.0 End Points are often categorized into three types.
+
+**Type 1:** These support coherent caching of host memory. An example
+might be a crypto accelerator. Such devices may also have device private
+memory accessible via means such as PCI memory reads and writes to BARs.
+
+**Type 2:** These support coherent caching of host memory and host
+managed device memory (HDM) for which the coherency protocol is managed
+by the host. This is a complex topic, so for more information on CXL
+coherency see the CXL 2.0 specification.
+
+**Type 3 Memory devices:** These devices act as a means of attaching
+additional memory (HDM) to a CXL host, including both volatile and
+persistent memory. The CXL topology may support interleaving across a
+number of Type 3 memory devices using HDM Decoders in the host, host
+bridge, switch upstream port and endpoints.
+
+Scope of CXL emulation in QEMU
+------------------------------
+The focus of CXL emulation is CXL revision 2.0 and later.
Earlier CXL
+revisions defined a smaller set of features, leaving much of the control
+interface as implementation defined or device specific, making generic
+emulation challenging, with host specific firmware being responsible
+for setup and the Endpoints being presented to operating systems
+as Root Complex Integrated End Points. CXL rev 2.0 looks a lot
+more like PCI Express, with fully specified discoverability
+of the CXL topology.
+
+CXL System components
+---------------------
+A CXL system is made up of a Host with a number of 'standard components',
+the control and capabilities of which are discoverable by system software
+using means described in the CXL 2.0 specification.
+
+CXL Fixed Memory Windows (CFMW)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+A CFMW consists of a particular range of Host Physical Address space
+which is routed to particular CXL Host Bridges. At the time of generic
+software initialization it will have a particular interleave
+configuration and an associated Quality of Service Throttling Group (QTG).
+This information is available to system software when making
+decisions about how to configure interleave across available CXL
+memory devices. It is provided as CFMW Structures (CFMWS) in
+the CXL Early Discovery Table (CEDT), an ACPI table.
+
+Note: QTG 0 is the only one currently supported in QEMU.
+
+CXL Host Bridge (CXL HB)
+~~~~~~~~~~~~~~~~~~~~~~~~
+A CXL host bridge is similar to the PCIe equivalent, but with a
+specification defined register interface called CXL Host Bridge
+Component Registers (CHBCR). The location of this CHBCR MMIO
+space is described to system software via a CXL Host Bridge
+Structure (CHBS) in the CEDT ACPI table. The actual interfaces
+are identical to those used for other parts of the CXL hierarchy
+as CXL Component Registers in PCI BARs.
+
+Interfaces provided include:
+
+* Configuration of HDM Decoders to route CXL Memory accesses with
+  a particular Host Physical Address range to the target port
+  below which the CXL device servicing that address lies. This
+  may be a mapping to a single Root Port (RP) or across a set of
+  target RPs.
+
+CXL Root Ports (CXL RP)
+~~~~~~~~~~~~~~~~~~~~~~~
+A CXL Root Port serves the same purpose as a PCIe Root Port.
+There are a number of CXL specific Designated Vendor Specific
+Extended Capabilities (DVSEC) in PCIe Configuration Space
+and associated component register access via PCI BARs.
+
+CXL Switch
+~~~~~~~~~~
+Not yet implemented in QEMU.
+
+Here we consider a simple CXL switch with only a single
+virtual hierarchy. Whilst more complex devices exist, their
+visibility to a particular host is generally the same as for
+a simple switch design. Hosts often have no awareness
+of complex rerouting and device pooling; they simply see
+devices being hot added or hot removed.
+
+A CXL switch has a similar architecture to those in PCIe,
+with a single upstream port, internal PCI bus and multiple
+downstream ports.
+
+Both the CXL upstream and downstream ports have CXL specific
+DVSECs in configuration space, and component registers in PCI
+BARs. The Upstream Port has the configuration interfaces for
+the HDM decoders which route incoming memory accesses to the
+appropriate downstream port.
+
+CXL Memory Devices - Type 3
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+CXL type 3 devices use a PCI class code and are intended to be supported
+by a generic operating system driver.
They have HDM decoders,
+though in these EP devices the decoder is responsible not for
+routing but for translation of the incoming host physical address (HPA)
+into a Device Physical Address (DPA).
+
+CXL Memory Interleave
+---------------------
+To understand the interaction of the different CXL hardware components
+emulated in QEMU, let us consider a memory read in a fully configured
+CXL topology. Note that system software is responsible for configuration
+of all components with the exception of the CFMWs. System software is
+responsible for allocating appropriate ranges from within the CFMWs
+and exposing those via normal memory configurations as would be done
+for system RAM.
+
+Example system topology. x marks the match in each decoder level::
+
+  |<------------------SYSTEM PHYSICAL ADDRESS MAP (1)----------------->|
+  |    __________   __________________________________   __________   |
+  |   |          | |                                  | |          |  |
+  |   | CFMW 0   | |  CXL Fixed Memory Window 1       | | CFMW 2   |  |
+  |   | HB0 only | |  Configured to interleave memory | | HB1 only |  |
+  |   |          | |  accesses across HB0/HB1         | |          |  |
+  |   |__________| |_____x____________________________| |__________|  |
+           |             |                    |               |
+           |             |                    |               |
+           |             |                    |               |
+   Interleave Decoder    |                    |               |
+    Matches this HB      |                    |               |
+           \_____________|                    |_______________/
+             ____________|_________    _______|_____________
+            |                      |  |                     |
+       (2)  | CXL HB 0             |  | CXL HB 1            |
+            | HB IntLv Decoders    |  | HB IntLv Decoders   |
+            | PCI/CXL Root Bus 0c  |  | PCI/CXL Root Bus 0d |
+            |                      |  |                     |
+            |___x__________________|  |_____________________|
+                |           |             |            |
+                |           |             |            |
+    A HB 0 HDM Decoder      |             |            |
+     matches this Port      |             |            |
+                |           |             |            |
+        ________|______   __|__________   |________   _|___________
+   (3) | Root Port 0   | | Root Port 1 | | Root Port 2 | | Root Port 3 |
+       | Appears in    | | Appears in  | | Appears in  | | Appears in  |
+       | PCI topology  | | PCI topology| | PCI topology| | PCI topology|
+       | as 0c:00.0    | | as 0c:01.0  | | as de:00.0  | | as de:01.0  |
+       |_______________| |_____________| |_____________| |_____________|
+               |                |               |               |
+               |                |               |               |
+        _______|_______   ______|______   ______|______   ______|______
+   (4) |       x       | |             | |             | |             |
+       | CXL Type3 0   | | CXL Type3 1 | | CXL Type3 2 | | CXL Type3 3 |
+       |               | |             | |             | |             |
+       | PMEM0(Vol LSA)| | PMEM1 (...) | | PMEM2 (...) | | PMEM3 (...) |
+       | Decoder to go | |             | |             | |             |
+       | from host PA  | | PCI 0e:00.0 | | PCI df:00.0 | | PCI e0:00.0 |
+       | to device PA  | |             | |             | |             |
+       | PCI as 0d:00.0| |             | |             | |             |
+       |_______________| |_____________| |_____________| |_____________|
+
+Notes:
+
+(1) **3 CXL Fixed Memory Windows (CFMW)** corresponding to different
+    ranges of the system physical address map. Each CFMW has a
+    particular interleave setup across the CXL Host Bridges (HB).
+    CFMW0 provides uninterleaved access to HB0, CFMW2 provides
+    uninterleaved access to HB1, and CFMW1 provides interleaved
+    memory access across HB0 and HB1.
+
+(2) **Two CXL Host Bridges**. Each of these has 2 CXL Root Ports and
+    programmable HDM decoders to route memory accesses either to
+    a single port or interleave them across multiple ports.
+    A complex configuration here might be to use the following HDM
+    decoders in HB0. HDM0 routes CFMW0 requests to RP0 and hence
+    part of CXL Type3 0. HDM1 routes CFMW0 requests from a
+    different region of the CFMW0 PA range to RP1 and hence part
+    of CXL Type3 1. HDM2 routes yet another PA range from within
+    CFMW0 to be interleaved across RP0 and RP1, providing 2 way
+    interleave of part of the memory provided by CXL Type3 0 and
+    CXL Type3 1.
HDM3 routes those interleaved accesses from
+    CFMW1 that target HB0 to RP0 and another part of the memory of
+    CXL Type3 0 (as part of a 2 way interleave at the system level
+    across, for example, CXL Type3 0 and CXL Type3 2).
+    HDM4 is used to enable system wide 4 way interleave across all
+    the present CXL type3 devices, by interleaving those (interleaved)
+    requests that HB0 receives from CFMW1 across RP0 and
+    RP1 and hence to yet more regions of the memory of the
+    attached Type3 devices. Note this is a representative subset
+    of the full range of possible HDM decoder configurations in this
+    topology.
+
+(3) **Four CXL Root Ports.** In this case the CXL Type 3 devices are
+    directly attached to these ports.
+
+(4) **Four CXL Type3 memory expansion devices.** These will each have
+    HDM decoders, but in this case rather than performing interleave
+    they will take the Host Physical Addresses of accesses and map
+    them to their own local Device Physical Address Space (DPA).
+
+Example command lines
+---------------------
+A very simple setup with just one directly attached CXL Type 3 device::
+
+  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  ...
+  -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
+  -object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
+  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+  -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+  -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -cxl-fixed-memory-window targets.0=cxl.1,size=4G
+
+A setup suitable for 4 way interleave. Only one fixed window is provided;
+it interleaves across the 2 CXL host bridges. Each host bridge has 2 CXL
+Root Ports, with the CXL Type3 devices directly attached (no switches)::
+
+  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  ...
+  -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
+  -object memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M \
+  -object memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M \
+  -object memory-backend-file,id=cxl-mem4,share=on,mem-path=/tmp/cxltest4.raw,size=256M \
+  -object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
+  -object memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M \
+  -object memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M \
+  -object memory-backend-file,id=cxl-lsa4,share=on,mem-path=/tmp/lsa4.raw,size=256M \
+  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+  -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2 \
+  -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+  -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -device cxl-rp,port=1,bus=cxl.1,id=root_port14,chassis=0,slot=3 \
+  -device cxl-type3,bus=root_port14,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
+  -device cxl-rp,port=0,bus=cxl.2,id=root_port15,chassis=0,slot=5 \
+  -device cxl-type3,bus=root_port15,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
+  -device cxl-rp,port=1,bus=cxl.2,id=root_port16,chassis=0,slot=6 \
+  -device cxl-type3,bus=root_port16,memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
+  -cxl-fixed-memory-window targets.0=cxl.1,targets.1=cxl.2,size=4G,interleave-granularity=8k
+
+Kernel Configuration Options
+----------------------------
+
+In Linux 5.18 the following options are necessary to make use of
+OS management of CXL memory devices as described here.
+
+* CONFIG_CXL_BUS
+* CONFIG_CXL_PCI
+* CONFIG_CXL_ACPI
+* CONFIG_CXL_PMEM
+* CONFIG_CXL_MEM
+* CONFIG_CXL_PORT
+* CONFIG_CXL_REGION
+
+References
+----------
+
+ - Consortium website for specifications etc.:
+   http://www.computeexpresslink.org
+ - Compute Express Link (CXL) Specification, Revision 2.0, October 2020
+ - CEDT CFMWS & QTG _DSM ECN, May 2021
diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..5f522e68e9 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -216,7 +216,7 @@ static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
    }

    v->config_size = sizeof(struct virtio_9p_config) + strlen(s->fsconf.tag);
-    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size);
+    virtio_init(vdev, VIRTIO_ID_9P, v->config_size);
    v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);
}

diff --git a/hw/Kconfig b/hw/Kconfig
index ad20cce0a9..50e0952889 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -6,6 +6,7 @@ source audio/Kconfig
source block/Kconfig
source char/Kconfig
source core/Kconfig
+source cxl/Kconfig
source display/Kconfig
source dma/Kconfig
source gpio/Kconfig
diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index 19caebde6c..3703aca212 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -5,6 +5,7 @@ config ACPI_X86
    bool
    select ACPI
    select ACPI_NVDIMM
+    select ACPI_CXL
    select ACPI_CPU_HOTPLUG
    select ACPI_MEMORY_HOTPLUG
    select ACPI_HMAT
@@ -66,3 +67,7 @@ config ACPI_ERST
    bool
    default y
    depends on ACPI && PCI
+
+config ACPI_CXL
+    bool
+    depends on ACPI
diff --git a/hw/acpi/cxl-stub.c b/hw/acpi/cxl-stub.c
new file mode 100644
index 0000000000..15bc21076b
--- /dev/null
+++ b/hw/acpi/cxl-stub.c
@@ -0,0 +1,12 @@
+
+/*
+ * Stubs for ACPI platforms that don't support CXL
+ */
+#include "qemu/osdep.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/cxl.h"
+
+void
build_cxl_osc_method(Aml *dev) +{ + g_assert_not_reached(); +} diff --git a/hw/acpi/cxl.c b/hw/acpi/cxl.c new file mode 100644 index 0000000000..31d5235136 --- /dev/null +++ b/hw/acpi/cxl.c @@ -0,0 +1,257 @@ +/* + * CXL ACPI Implementation + * + * Copyright(C) 2020 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "qemu/osdep.h" +#include "hw/sysbus.h" +#include "hw/pci/pci_bridge.h" +#include "hw/pci/pci_host.h" +#include "hw/cxl/cxl.h" +#include "hw/mem/memory-device.h" +#include "hw/acpi/acpi.h" +#include "hw/acpi/aml-build.h" +#include "hw/acpi/bios-linker-loader.h" +#include "hw/acpi/cxl.h" +#include "qapi/error.h" +#include "qemu/uuid.h" + +static void cedt_build_chbs(GArray *table_data, PXBDev *cxl) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(cxl->cxl.cxl_host_bridge); + struct MemoryRegion *mr = sbd->mmio[0].memory; + + /* Type */ + build_append_int_noprefix(table_data, 0, 1); + + /* Reserved */ + build_append_int_noprefix(table_data, 0, 1); + + /* Record Length */ + build_append_int_noprefix(table_data, 32, 2); + + /* UID - currently equal to bus number */ + build_append_int_noprefix(table_data, cxl->bus_nr, 4); + + /* Version */ + build_append_int_noprefix(table_data, 1, 4); + + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + + /* Base - subregion within a container that is in PA space */ + build_append_int_noprefix(table_data, mr->container->addr + mr->addr, 8); + + /* Length */ + build_append_int_noprefix(table_data, memory_region_size(mr), 8); +} + +/* + * CFMWS entries in CXL 2.0 ECN: CEDT CFMWS & QTG _DSM. + * Interleave ways encoding in CXL 2.0 ECN: 3, 6, 12 and 16-way memory + * interleaving. 
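+ * Note: each CFMWS record written below is 36 bytes long plus 4 bytes
+ * per interleave target, matching the Record Length field.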
+ */ +static void cedt_build_cfmws(GArray *table_data, MachineState *ms) +{ + CXLState *cxls = ms->cxl_devices_state; + GList *it; + + for (it = cxls->fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + int i; + + /* Type */ + build_append_int_noprefix(table_data, 1, 1); + + /* Reserved */ + build_append_int_noprefix(table_data, 0, 1); + + /* Record Length */ + build_append_int_noprefix(table_data, 36 + 4 * fw->num_targets, 2); + + /* Reserved */ + build_append_int_noprefix(table_data, 0, 4); + + /* Base HPA */ + build_append_int_noprefix(table_data, fw->mr.addr, 8); + + /* Window Size */ + build_append_int_noprefix(table_data, fw->size, 8); + + /* Host Bridge Interleave Ways */ + build_append_int_noprefix(table_data, fw->enc_int_ways, 1); + + /* Host Bridge Interleave Arithmetic */ + build_append_int_noprefix(table_data, 0, 1); + + /* Reserved */ + build_append_int_noprefix(table_data, 0, 2); + + /* Host Bridge Interleave Granularity */ + build_append_int_noprefix(table_data, fw->enc_int_gran, 4); + + /* Window Restrictions */ + build_append_int_noprefix(table_data, 0x0f, 2); /* No restrictions */ + + /* QTG ID */ + build_append_int_noprefix(table_data, 0, 2); + + /* Host Bridge List (list of UIDs - currently bus_nr) */ + for (i = 0; i < fw->num_targets; i++) { + g_assert(fw->target_hbs[i]); + build_append_int_noprefix(table_data, fw->target_hbs[i]->bus_nr, 4); + } + } +} + +static int cxl_foreach_pxb_hb(Object *obj, void *opaque) +{ + Aml *cedt = opaque; + + if (object_dynamic_cast(obj, TYPE_PXB_CXL_DEVICE)) { + cedt_build_chbs(cedt->buf, PXB_CXL_DEV(obj)); + } + + return 0; +} + +void cxl_build_cedt(MachineState *ms, GArray *table_offsets, GArray *table_data, + BIOSLinker *linker, const char *oem_id, + const char *oem_table_id) +{ + Aml *cedt; + AcpiTable table = { .sig = "CEDT", .rev = 1, .oem_id = oem_id, + .oem_table_id = oem_table_id }; + + acpi_add_table(table_offsets, table_data); + acpi_table_begin(&table, table_data); + cedt = init_aml_allocator(); + + /* reserve space for CEDT header */ + + object_child_foreach_recursive(object_get_root(), cxl_foreach_pxb_hb, cedt); + cedt_build_cfmws(cedt->buf, ms); + + /* copy AML table into ACPI tables blob and patch header there */ + g_array_append_vals(table_data, cedt->buf->data, cedt->buf->len); + free_aml_allocator(); + + acpi_table_end(linker, &table); +} + +static Aml *__build_cxl_osc_method(void) +{ + Aml *method, *if_uuid, *else_uuid, *if_arg1_not_1, *if_cxl, *if_caps_masked; + Aml *a_ctrl = aml_local(0); + Aml *a_cdw1 = aml_name("CDW1"); + + method = aml_method("_OSC", 4, AML_NOTSERIALIZED); + /* CDW1 is used for the return value so is present whether or not a match occurs */ + aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); + + /* + * Generate shared section between: + * CXL 2.0 - 9.14.2.1.4 and + * PCI Firmware Specification 3.0 + * 4.5.1. _OSC Interface for PCI Host Bridge Devices + * The _OSC interface for a PCI/PCI-X/PCI Express hierarchy is + * identified by the Universal Unique IDentifier (UUID) + * 33DB4D5B-1FF7-401C-9657-7441C03DD766 + * The _OSC interface for a CXL Host bridge is + * identified by the UUID 68F2D50B-C469-4D8A-BD3D-941A103FD3FC + * A CXL Host bridge is compatible with a PCI host bridge so + * for the shared section match both. 
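+ * In all cases CDW1 (the first DWord of Arg3) is used to return status
+ * bits to the OS, e.g. Unknown Revision or Capabilities Masked below.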
+ */ + if_uuid = aml_if( + aml_lor(aml_equal(aml_arg(0), + aml_touuid("33DB4D5B-1FF7-401C-9657-7441C03DD766")), + aml_equal(aml_arg(0), + aml_touuid("68F2D50B-C469-4D8A-BD3D-941A103FD3FC")))); + aml_append(if_uuid, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); + aml_append(if_uuid, aml_create_dword_field(aml_arg(3), aml_int(8), "CDW3")); + + aml_append(if_uuid, aml_store(aml_name("CDW3"), a_ctrl)); + + /* + * + * Allows OS control for all 5 features: + * PCIeHotplug SHPCHotplug PME AER PCIeCapability + */ + aml_append(if_uuid, aml_and(a_ctrl, aml_int(0x1F), a_ctrl)); + + /* + * Check _OSC revision. + * PCI Firmware specification 3.3 and CXL 2.0 both use revision 1 + * Unknown Revision is CDW1 - BIT (3) + */ + if_arg1_not_1 = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(0x1)))); + aml_append(if_arg1_not_1, aml_or(a_cdw1, aml_int(0x08), a_cdw1)); + aml_append(if_uuid, if_arg1_not_1); + + if_caps_masked = aml_if(aml_lnot(aml_equal(aml_name("CDW3"), a_ctrl))); + + /* Capability bits were masked */ + aml_append(if_caps_masked, aml_or(a_cdw1, aml_int(0x10), a_cdw1)); + aml_append(if_uuid, if_caps_masked); + + aml_append(if_uuid, aml_store(aml_name("CDW2"), aml_name("SUPP"))); + aml_append(if_uuid, aml_store(aml_name("CDW3"), aml_name("CTRL"))); + + /* Update DWORD3 (the return value) */ + aml_append(if_uuid, aml_store(a_ctrl, aml_name("CDW3"))); + + /* CXL only section as per CXL 2.0 - 9.14.2.1.4 */ + if_cxl = aml_if(aml_equal( + aml_arg(0), aml_touuid("68F2D50B-C469-4D8A-BD3D-941A103FD3FC"))); + /* CXL support field */ + aml_append(if_cxl, aml_create_dword_field(aml_arg(3), aml_int(12), "CDW4")); + /* CXL capabilities */ + aml_append(if_cxl, aml_create_dword_field(aml_arg(3), aml_int(16), "CDW5")); + aml_append(if_cxl, aml_store(aml_name("CDW4"), aml_name("SUPC"))); + aml_append(if_cxl, aml_store(aml_name("CDW5"), aml_name("CTRC"))); + + /* CXL 2.0 Port/Device Register access */ + aml_append(if_cxl, + aml_or(aml_name("CDW5"), aml_int(0x1), aml_name("CDW5"))); + aml_append(if_uuid, if_cxl); + + aml_append(if_uuid, aml_return(aml_arg(3))); + aml_append(method, if_uuid); + + /* + * If no UUID matched, return Unrecognized UUID via Arg3 DWord 1 + * ACPI 6.4 - 6.2.11 + * Unrecognised UUID - BIT(2) + */ + else_uuid = aml_else(); + + aml_append(else_uuid, + aml_or(aml_name("CDW1"), aml_int(0x4), aml_name("CDW1"))); + aml_append(else_uuid, aml_return(aml_arg(3))); + aml_append(method, else_uuid); + + return method; +} + +void build_cxl_osc_method(Aml *dev) +{ + aml_append(dev, aml_name_decl("SUPP", aml_int(0))); + aml_append(dev, aml_name_decl("CTRL", aml_int(0))); + aml_append(dev, aml_name_decl("SUPC", aml_int(0))); + aml_append(dev, aml_name_decl("CTRC", aml_int(0))); + aml_append(dev, __build_cxl_osc_method()); +} diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index 8bea2e6933..cea2f5f93a 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -13,6 +13,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_MEMORY_HOTPLUG', if_false: files('acpi-mem-hotplu acpi_ss.add(when: 'CONFIG_ACPI_NVDIMM', if_true: files('nvdimm.c')) acpi_ss.add(when: 'CONFIG_ACPI_NVDIMM', if_false: files('acpi-nvdimm-stub.c')) acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c')) +acpi_ss.add(when: 'CONFIG_ACPI_CXL', if_true: files('cxl.c'), if_false: files('cxl-stub.c')) acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c')) acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c')) acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c')) @@ -33,4 +34,5 @@ 
softmmu_ss.add_all(when: 'CONFIG_ACPI', if_true: acpi_ss) softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('acpi-stub.c', 'aml-build-stub.c', 'acpi-x86-stub.c', 'ipmi-stub.c', 'ghes-stub.c', 'acpi-mem-hotplug-stub.c', 'acpi-cpu-hotplug-stub.c', - 'acpi-pci-hotplug-stub.c', 'acpi-nvdimm-stub.c')) + 'acpi-pci-hotplug-stub.c', 'acpi-nvdimm-stub.c', + 'cxl-stub.c')) diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 97f3b38019..219262a8da 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -29,6 +29,7 @@ config ARM_VIRT select ACPI_APEI select ACPI_VIOT select VIRTIO_MEM_SUPPORTED + select ACPI_CXL config CHEETAH bool diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 1a42ae9187..5dca4eab09 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -491,7 +491,7 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) return; } - virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, + virtio_init(vdev, VIRTIO_ID_BLOCK, sizeof(struct virtio_blk_config)); s->virtqs = g_new(VirtQueue *, s->num_queues); @@ -569,6 +569,12 @@ static void vhost_user_blk_instance_init(Object *obj) "/disk@0,0", DEVICE(obj)); } +static struct vhost_dev *vhost_user_blk_get_vhost(VirtIODevice *vdev) +{ + VHostUserBlk *s = VHOST_USER_BLK(vdev); + return &s->dev; +} + static const VMStateDescription vmstate_vhost_user_blk = { .name = "vhost-user-blk", .minimum_version_id = 1, @@ -603,6 +609,7 @@ static void vhost_user_blk_class_init(ObjectClass *klass, void *data) vdc->get_features = vhost_user_blk_get_features; vdc->set_status = vhost_user_blk_set_status; vdc->reset = vhost_user_blk_reset; + vdc->get_vhost = vhost_user_blk_get_vhost; } static const TypeInfo vhost_user_blk_info = { diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 6a1cc41877..cd804795c6 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1206,7 +1206,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) virtio_blk_set_config_size(s, s->host_features); - virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size); + virtio_init(vdev, VIRTIO_ID_BLOCK, s->config_size); s->blk = conf->conf.blk; s->rq = NULL; diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 6048d408b8..7d4601cb5d 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -1044,8 +1044,7 @@ static void virtio_serial_device_realize(DeviceState *dev, Error **errp) VIRTIO_CONSOLE_F_EMERG_WRITE)) { config_size = offsetof(struct virtio_console_config, emerg_wr); } - virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE, - config_size); + virtio_init(vdev, VIRTIO_ID_CONSOLE, config_size); /* Spawn a new virtio-serial bus on which the ports will ride as devices */ qbus_init(&vser->bus, sizeof(vser->bus), TYPE_VIRTIO_SERIAL_BUS, diff --git a/hw/core/machine.c b/hw/core/machine.c index 3264c1e11d..b03d9192ba 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -33,6 +33,7 @@ #include "sysemu/qtest.h" #include "hw/pci/pci.h" #include "hw/mem/nvdimm.h" +#include "hw/cxl/cxl.h" #include "migration/global_state.h" #include "migration/vmstate.h" #include "exec/confidential-guest-support.h" @@ -625,6 +626,20 @@ static void machine_set_nvdimm_persistence(Object *obj, const char *value, nvdimms_state->persistence_string = g_strdup(value); } +static bool machine_get_cxl(Object *obj, Error **errp) +{ + MachineState *ms = MACHINE(obj); + + return ms->cxl_devices_state->is_enabled; +} + +static void machine_set_cxl(Object *obj, bool value, Error **errp) +{ + 
MachineState *ms = MACHINE(obj); + + ms->cxl_devices_state->is_enabled = value; +} + void machine_class_allow_dynamic_sysbus_dev(MachineClass *mc, const char *type) { QAPI_LIST_PREPEND(mc->allowed_dynamic_sysbus_devices, g_strdup(type)); @@ -911,6 +926,8 @@ static void machine_class_init(ObjectClass *oc, void *data) mc->default_ram_size = 128 * MiB; mc->rom_file_has_mr = true; + /* Few machines support CXL, so default to off */ + mc->cxl_supported = false; /* numa node memory size aligned on 8MB by default. * On Linux, each node's border has to be 8MB aligned */ @@ -1071,6 +1088,16 @@ static void machine_initfn(Object *obj) "Valid values are cpu, mem-ctrl"); } + if (mc->cxl_supported) { + Object *obj = OBJECT(ms); + + ms->cxl_devices_state = g_new0(CXLState, 1); + object_property_add_bool(obj, "cxl", machine_get_cxl, machine_set_cxl); + object_property_set_description(obj, "cxl", + "Set on/off to enable/disable " + "CXL instantiation"); + } + if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { ms->numa_state = g_new0(NumaState, 1); object_property_add_bool(obj, "hmat", @@ -1108,6 +1135,7 @@ static void machine_finalize(Object *obj) g_free(ms->device_memory); g_free(ms->nvdimms_state); g_free(ms->numa_state); + g_free(ms->cxl_devices_state); } bool machine_usb(MachineState *machine) diff --git a/hw/cxl/Kconfig b/hw/cxl/Kconfig new file mode 100644 index 0000000000..8e67519b16 --- /dev/null +++ b/hw/cxl/Kconfig @@ -0,0 +1,3 @@ +config CXL + bool + default y if PCI_EXPRESS diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c new file mode 100644 index 0000000000..7985c9bfca --- /dev/null +++ b/hw/cxl/cxl-component-utils.c @@ -0,0 +1,396 @@ +/* + * CXL Utility library for components + * + * Copyright(C) 2020 Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qapi/error.h" +#include "hw/pci/pci.h" +#include "hw/cxl/cxl.h" + +static uint64_t cxl_cache_mem_read_reg(void *opaque, hwaddr offset, + unsigned size) +{ + CXLComponentState *cxl_cstate = opaque; + ComponentRegisters *cregs = &cxl_cstate->crb; + + if (size == 8) { + qemu_log_mask(LOG_UNIMP, + "CXL 8 byte cache mem registers not implemented\n"); + return 0; + } + + if (cregs->special_ops && cregs->special_ops->read) { + return cregs->special_ops->read(cxl_cstate, offset, size); + } else { + return cregs->cache_mem_registers[offset / sizeof(*cregs->cache_mem_registers)]; + } +} + +static void dumb_hdm_handler(CXLComponentState *cxl_cstate, hwaddr offset, + uint32_t value) +{ + ComponentRegisters *cregs = &cxl_cstate->crb; + uint32_t *cache_mem = cregs->cache_mem_registers; + bool should_commit = false; + + switch (offset) { + case A_CXL_HDM_DECODER0_CTRL: + should_commit = FIELD_EX32(value, CXL_HDM_DECODER0_CTRL, COMMIT); + break; + default: + break; + } + + memory_region_transaction_begin(); + stl_le_p((uint8_t *)cache_mem + offset, value); + if (should_commit) { + ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMIT, 0); + ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, ERR, 0); + ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1); + } + memory_region_transaction_commit(); +} + +static void cxl_cache_mem_write_reg(void *opaque, hwaddr offset, uint64_t value, + unsigned size) +{ + CXLComponentState *cxl_cstate = opaque; + ComponentRegisters *cregs = &cxl_cstate->crb; + uint32_t mask; + + if (size == 8) { + qemu_log_mask(LOG_UNIMP, + "CXL 8 byte cache mem registers not implemented\n"); + return; + } + mask = cregs->cache_mem_regs_write_mask[offset / sizeof(*cregs->cache_mem_regs_write_mask)]; + value &= mask; + /* RO bits should remain constant. Done by reading existing value */ + value |= ~mask & cregs->cache_mem_registers[offset / sizeof(*cregs->cache_mem_registers)]; + if (cregs->special_ops && cregs->special_ops->write) { + cregs->special_ops->write(cxl_cstate, offset, value, size); + return; + } + + if (offset >= A_CXL_HDM_DECODER_CAPABILITY && + offset <= A_CXL_HDM_DECODER0_TARGET_LIST_HI) { + dumb_hdm_handler(cxl_cstate, offset, value); + } else { + cregs->cache_mem_registers[offset / sizeof(*cregs->cache_mem_registers)] = value; + } +} + +/* + * 8.2.3 + * The access restrictions specified in Section 8.2.2 also apply to CXL 2.0 + * Component Registers. + * + * 8.2.2 + * • A 32 bit register shall be accessed as a 4 Bytes quantity. Partial + * reads are not permitted. + * • A 64 bit register shall be accessed as a 8 Bytes quantity. Partial + * reads are not permitted. + * + * As of the spec defined today, only 4 byte registers exist. 
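+ * The MemoryRegionOps below therefore set .valid.min_access_size to 4;
+ * 8 byte accesses are accepted by the region but currently logged as
+ * unimplemented by the read/write handlers above.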
+ */ +static const MemoryRegionOps cache_mem_ops = { + .read = cxl_cache_mem_read_reg, + .write = cxl_cache_mem_write_reg, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 4, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 4, + .max_access_size = 8, + }, +}; + +void cxl_component_register_block_init(Object *obj, + CXLComponentState *cxl_cstate, + const char *type) +{ + ComponentRegisters *cregs = &cxl_cstate->crb; + + memory_region_init(&cregs->component_registers, obj, type, + CXL2_COMPONENT_BLOCK_SIZE); + + /* io registers controls link which we don't care about in QEMU */ + memory_region_init_io(&cregs->io, obj, NULL, cregs, ".io", + CXL2_COMPONENT_IO_REGION_SIZE); + memory_region_init_io(&cregs->cache_mem, obj, &cache_mem_ops, cregs, + ".cache_mem", CXL2_COMPONENT_CM_REGION_SIZE); + + memory_region_add_subregion(&cregs->component_registers, 0, &cregs->io); + memory_region_add_subregion(&cregs->component_registers, + CXL2_COMPONENT_IO_REGION_SIZE, + &cregs->cache_mem); +} + +static void ras_init_common(uint32_t *reg_state, uint32_t *write_msk) +{ + /* + * Error status is RW1C but given bits are not yet set, it can + * be handled as RO. + */ + reg_state[R_CXL_RAS_UNC_ERR_STATUS] = 0; + /* Bits 12-13 and 17-31 reserved in CXL 2.0 */ + reg_state[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff; + write_msk[R_CXL_RAS_UNC_ERR_MASK] = 0x1cfff; + reg_state[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff; + write_msk[R_CXL_RAS_UNC_ERR_SEVERITY] = 0x1cfff; + reg_state[R_CXL_RAS_COR_ERR_STATUS] = 0; + reg_state[R_CXL_RAS_COR_ERR_MASK] = 0x7f; + write_msk[R_CXL_RAS_COR_ERR_MASK] = 0x7f; + /* CXL switches and devices must set */ + reg_state[R_CXL_RAS_ERR_CAP_CTRL] = 0x00; +} + +static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk) +{ + int decoder_count = 1; + int i; + + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, DECODER_COUNT, + cxl_decoder_count_enc(decoder_count)); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT, 1); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_256B, 1); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_4K, 1); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, POISON_ON_ERR_CAP, 0); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_GLOBAL_CONTROL, + HDM_DECODER_ENABLE, 0); + write_msk[R_CXL_HDM_DECODER_GLOBAL_CONTROL] = 0x3; + for (i = 0; i < decoder_count; i++) { + write_msk[R_CXL_HDM_DECODER0_BASE_LO + i * 0x20] = 0xf0000000; + write_msk[R_CXL_HDM_DECODER0_BASE_HI + i * 0x20] = 0xffffffff; + write_msk[R_CXL_HDM_DECODER0_SIZE_LO + i * 0x20] = 0xf0000000; + write_msk[R_CXL_HDM_DECODER0_SIZE_HI + i * 0x20] = 0xffffffff; + write_msk[R_CXL_HDM_DECODER0_CTRL + i * 0x20] = 0x13ff; + } +} + +void cxl_component_register_init_common(uint32_t *reg_state, uint32_t *write_msk, + enum reg_type type) +{ + int caps = 0; + + /* + * In CXL 2.0 the capabilities required for each CXL component are such that, + * with the ordering chosen here, a single number can be used to define + * which capabilities should be provided. 
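+ * Here 2 = RAS + Link, 3 additionally includes HDM, and 5 also adds
+ * Extended Security and Snoop.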
+ */ + switch (type) { + case CXL2_DOWNSTREAM_PORT: + case CXL2_DEVICE: + /* RAS, Link */ + caps = 2; + break; + case CXL2_UPSTREAM_PORT: + case CXL2_TYPE3_DEVICE: + case CXL2_LOGICAL_DEVICE: + /* + HDM */ + caps = 3; + break; + case CXL2_ROOT_PORT: + /* + Extended Security, + Snoop */ + caps = 5; + break; + default: + abort(); + } + + memset(reg_state, 0, CXL2_COMPONENT_CM_REGION_SIZE); + + /* CXL Capability Header Register */ + ARRAY_FIELD_DP32(reg_state, CXL_CAPABILITY_HEADER, ID, 1); + ARRAY_FIELD_DP32(reg_state, CXL_CAPABILITY_HEADER, VERSION, 1); + ARRAY_FIELD_DP32(reg_state, CXL_CAPABILITY_HEADER, CACHE_MEM_VERSION, 1); + ARRAY_FIELD_DP32(reg_state, CXL_CAPABILITY_HEADER, ARRAY_SIZE, caps); + +#define init_cap_reg(reg, id, version) \ + QEMU_BUILD_BUG_ON(CXL_##reg##_REGISTERS_OFFSET == 0); \ + do { \ + int which = R_CXL_##reg##_CAPABILITY_HEADER; \ + reg_state[which] = FIELD_DP32(reg_state[which], \ + CXL_##reg##_CAPABILITY_HEADER, ID, id); \ + reg_state[which] = \ + FIELD_DP32(reg_state[which], CXL_##reg##_CAPABILITY_HEADER, \ + VERSION, version); \ + reg_state[which] = \ + FIELD_DP32(reg_state[which], CXL_##reg##_CAPABILITY_HEADER, PTR, \ + CXL_##reg##_REGISTERS_OFFSET); \ + } while (0) + + init_cap_reg(RAS, 2, 2); + ras_init_common(reg_state, write_msk); + + init_cap_reg(LINK, 4, 2); + + if (caps < 3) { + return; + } + + init_cap_reg(HDM, 5, 1); + hdm_init_common(reg_state, write_msk); + + if (caps < 5) { + return; + } + + init_cap_reg(EXTSEC, 6, 1); + init_cap_reg(SNOOP, 8, 1); + +#undef init_cap_reg +} + +/* + * Helper to creates a DVSEC header for a CXL entity. The caller is responsible + * for tracking the valid offset. + * + * This function will build the DVSEC header on behalf of the caller and then + * copy in the remaining data for the vendor specific bits. + * It will also set up appropriate write masks. 
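+ * Note that the first DWord of the DVSEC header packs the length in
+ * bits 31:20, the revision in bits 19:16 and the CXL vendor ID in
+ * bits 15:0, as written with pci_set_long() below.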
+ */ +void cxl_component_create_dvsec(CXLComponentState *cxl, + enum reg_type cxl_dev_type, uint16_t length, + uint16_t type, uint8_t rev, uint8_t *body) +{ + PCIDevice *pdev = cxl->pdev; + uint16_t offset = cxl->dvsec_offset; + uint8_t *wmask = pdev->wmask; + + assert(offset >= PCI_CFG_SPACE_SIZE && + ((offset + length) < PCI_CFG_SPACE_EXP_SIZE)); + assert((length & 0xf000) == 0); + assert((rev & ~0xf) == 0); + + /* Create the DVSEC in the MCFG space */ + pcie_add_capability(pdev, PCI_EXT_CAP_ID_DVSEC, 1, offset, length); + pci_set_long(pdev->config + offset + PCIE_DVSEC_HEADER1_OFFSET, + (length << 20) | (rev << 16) | CXL_VENDOR_ID); + pci_set_word(pdev->config + offset + PCIE_DVSEC_ID_OFFSET, type); + memcpy(pdev->config + offset + sizeof(DVSECHeader), + body + sizeof(DVSECHeader), + length - sizeof(DVSECHeader)); + + /* Configure write masks */ + switch (type) { + case PCIE_CXL_DEVICE_DVSEC: + /* Cntrl RW Lock - so needs explicit blocking when lock is set */ + wmask[offset + offsetof(CXLDVSECDevice, ctrl)] = 0xFD; + wmask[offset + offsetof(CXLDVSECDevice, ctrl) + 1] = 0x4F; + /* Status is RW1CS */ + wmask[offset + offsetof(CXLDVSECDevice, ctrl2)] = 0x0F; + /* Lock is RW Once */ + wmask[offset + offsetof(CXLDVSECDevice, lock)] = 0x01; + /* range1/2_base_high/low is RW Lock */ + wmask[offset + offsetof(CXLDVSECDevice, range1_base_hi)] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range1_base_hi) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range1_base_hi) + 2] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range1_base_hi) + 3] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range1_base_lo) + 3] = 0xF0; + wmask[offset + offsetof(CXLDVSECDevice, range2_base_hi)] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range2_base_hi) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range2_base_hi) + 2] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range2_base_hi) + 3] = 0xFF; + wmask[offset + offsetof(CXLDVSECDevice, range2_base_lo) + 3] = 0xF0; + break; + case NON_CXL_FUNCTION_MAP_DVSEC: + break; /* Not yet implemented */ + case EXTENSIONS_PORT_DVSEC: + wmask[offset + offsetof(CXLDVSECPortExtensions, control)] = 0x0F; + wmask[offset + offsetof(CXLDVSECPortExtensions, control) + 1] = 0x40; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_bus_base)] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_bus_limit)] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_memory_base)] = 0xF0; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_memory_base) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_memory_limit)] = 0xF0; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_memory_limit) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_base)] = 0xF0; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_base) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_limit)] = 0xF0; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_limit) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_base_high)] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_base_high) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_base_high) + 2] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_base_high) + 3] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_limit_high)] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_limit_high) + 1] = 0xFF; + 
wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_limit_high) + 2] = 0xFF; + wmask[offset + offsetof(CXLDVSECPortExtensions, alt_prefetch_limit_high) + 3] = 0xFF; + break; + case GPF_PORT_DVSEC: + wmask[offset + offsetof(CXLDVSECPortGPF, phase1_ctrl)] = 0x0F; + wmask[offset + offsetof(CXLDVSECPortGPF, phase1_ctrl) + 1] = 0x0F; + wmask[offset + offsetof(CXLDVSECPortGPF, phase2_ctrl)] = 0x0F; + wmask[offset + offsetof(CXLDVSECPortGPF, phase2_ctrl) + 1] = 0x0F; + break; + case GPF_DEVICE_DVSEC: + wmask[offset + offsetof(CXLDVSECDeviceGPF, phase2_duration)] = 0x0F; + wmask[offset + offsetof(CXLDVSECDeviceGPF, phase2_duration) + 1] = 0x0F; + wmask[offset + offsetof(CXLDVSECDeviceGPF, phase2_power)] = 0xFF; + wmask[offset + offsetof(CXLDVSECDeviceGPF, phase2_power) + 1] = 0xFF; + wmask[offset + offsetof(CXLDVSECDeviceGPF, phase2_power) + 2] = 0xFF; + wmask[offset + offsetof(CXLDVSECDeviceGPF, phase2_power) + 3] = 0xFF; + break; + case PCIE_FLEXBUS_PORT_DVSEC: + switch (cxl_dev_type) { + case CXL2_ROOT_PORT: + /* No MLD */ + wmask[offset + offsetof(CXLDVSECPortFlexBus, ctrl)] = 0xbd; + break; + case CXL2_DOWNSTREAM_PORT: + wmask[offset + offsetof(CXLDVSECPortFlexBus, ctrl)] = 0xfd; + break; + default: /* Registers are RO for other component types */ + break; + } + /* There are rw1cs bits in the status register but never set currently */ + break; + } + + /* Update state for future DVSEC additions */ + range_init_nofail(&cxl->dvsecs[type], cxl->dvsec_offset, length); + cxl->dvsec_offset += length; +} + +uint8_t cxl_interleave_ways_enc(int iw, Error **errp) +{ + switch (iw) { + case 1: return 0x0; + case 2: return 0x1; + case 4: return 0x2; + case 8: return 0x3; + case 16: return 0x4; + case 3: return 0x8; + case 6: return 0x9; + case 12: return 0xa; + default: + error_setg(errp, "Interleave ways: %d not supported", iw); + return 0; + } +} + +uint8_t cxl_interleave_granularity_enc(uint64_t gran, Error **errp) +{ + switch (gran) { + case 256: return 0; + case 512: return 1; + case 1024: return 2; + case 2048: return 3; + case 4096: return 4; + case 8192: return 5; + case 16384: return 6; + default: + error_setg(errp, "Interleave granularity: %" PRIu64 " invalid", gran); + return 0; + } +} diff --git a/hw/cxl/cxl-device-utils.c b/hw/cxl/cxl-device-utils.c new file mode 100644 index 0000000000..687759b301 --- /dev/null +++ b/hw/cxl/cxl-device-utils.c @@ -0,0 +1,265 @@ +/* + * CXL Utility library for devices + * + * Copyright(C) 2020 Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "hw/cxl/cxl.h" + +/* + * Device registers have no restrictions per the spec, and so fall back to the + * default memory mapped register rules in 8.2: + * Software shall use CXL.io Memory Read and Write to access memory mapped + * register defined in this section. Unless otherwise specified, software + * shall restrict the accesses width based on the following: + * • A 32 bit register shall be accessed as a 1 Byte, 2 Bytes or 4 Bytes + * quantity. + * • A 64 bit register shall be accessed as a 1 Byte, 2 Bytes, 4 Bytes or 8 + * Bytes + * • The address shall be a multiple of the access width, e.g. when + * accessing a register as a 4 Byte quantity, the address shall be + * multiple of 4. 
+ * • The accesses shall map to contiguous bytes.If these rules are not + * followed, the behavior is undefined + */ + +static uint64_t caps_reg_read(void *opaque, hwaddr offset, unsigned size) +{ + CXLDeviceState *cxl_dstate = opaque; + + if (size == 4) { + return cxl_dstate->caps_reg_state32[offset / sizeof(*cxl_dstate->caps_reg_state32)]; + } else { + return cxl_dstate->caps_reg_state64[offset / sizeof(*cxl_dstate->caps_reg_state64)]; + } +} + +static uint64_t dev_reg_read(void *opaque, hwaddr offset, unsigned size) +{ + return 0; +} + +static uint64_t mailbox_reg_read(void *opaque, hwaddr offset, unsigned size) +{ + CXLDeviceState *cxl_dstate = opaque; + + switch (size) { + case 1: + return cxl_dstate->mbox_reg_state[offset]; + case 2: + return cxl_dstate->mbox_reg_state16[offset / size]; + case 4: + return cxl_dstate->mbox_reg_state32[offset / size]; + case 8: + return cxl_dstate->mbox_reg_state64[offset / size]; + default: + g_assert_not_reached(); + } +} + +static void mailbox_mem_writel(uint32_t *reg_state, hwaddr offset, + uint64_t value) +{ + switch (offset) { + case A_CXL_DEV_MAILBOX_CTRL: + /* fallthrough */ + case A_CXL_DEV_MAILBOX_CAP: + /* RO register */ + break; + default: + qemu_log_mask(LOG_UNIMP, + "%s Unexpected 32-bit access to 0x%" PRIx64 " (WI)\n", + __func__, offset); + return; + } + + reg_state[offset / sizeof(*reg_state)] = value; +} + +static void mailbox_mem_writeq(uint64_t *reg_state, hwaddr offset, + uint64_t value) +{ + switch (offset) { + case A_CXL_DEV_MAILBOX_CMD: + break; + case A_CXL_DEV_BG_CMD_STS: + /* BG not supported */ + /* fallthrough */ + case A_CXL_DEV_MAILBOX_STS: + /* Read only register, will get updated by the state machine */ + return; + default: + qemu_log_mask(LOG_UNIMP, + "%s Unexpected 64-bit access to 0x%" PRIx64 " (WI)\n", + __func__, offset); + return; + } + + + reg_state[offset / sizeof(*reg_state)] = value; +} + +static void mailbox_reg_write(void *opaque, hwaddr offset, uint64_t value, + unsigned size) +{ + CXLDeviceState *cxl_dstate = opaque; + + if (offset >= A_CXL_DEV_CMD_PAYLOAD) { + memcpy(cxl_dstate->mbox_reg_state + offset, &value, size); + return; + } + + switch (size) { + case 4: + mailbox_mem_writel(cxl_dstate->mbox_reg_state32, offset, value); + break; + case 8: + mailbox_mem_writeq(cxl_dstate->mbox_reg_state64, offset, value); + break; + default: + g_assert_not_reached(); + } + + if (ARRAY_FIELD_EX32(cxl_dstate->mbox_reg_state32, CXL_DEV_MAILBOX_CTRL, + DOORBELL)) { + cxl_process_mailbox(cxl_dstate); + } +} + +static uint64_t mdev_reg_read(void *opaque, hwaddr offset, unsigned size) +{ + uint64_t retval = 0; + + retval = FIELD_DP64(retval, CXL_MEM_DEV_STS, MEDIA_STATUS, 1); + retval = FIELD_DP64(retval, CXL_MEM_DEV_STS, MBOX_READY, 1); + + return retval; +} + +static const MemoryRegionOps mdev_ops = { + .read = mdev_reg_read, + .write = NULL, /* memory device register is read only */ + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 8, + .max_access_size = 8, + }, +}; + +static const MemoryRegionOps mailbox_ops = { + .read = mailbox_reg_read, + .write = mailbox_reg_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static const MemoryRegionOps dev_ops = { + .read = dev_reg_read, + .write = NULL, /* status register is read only */ + .endianness = 
DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static const MemoryRegionOps caps_ops = { + .read = caps_reg_read, + .write = NULL, /* caps registers are read only */ + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .impl = { + .min_access_size = 4, + .max_access_size = 8, + }, +}; + +void cxl_device_register_block_init(Object *obj, CXLDeviceState *cxl_dstate) +{ + /* This will be a BAR, so needs to be rounded up to pow2 for PCI spec */ + memory_region_init(&cxl_dstate->device_registers, obj, "device-registers", + pow2ceil(CXL_MMIO_SIZE)); + + memory_region_init_io(&cxl_dstate->caps, obj, &caps_ops, cxl_dstate, + "cap-array", CXL_CAPS_SIZE); + memory_region_init_io(&cxl_dstate->device, obj, &dev_ops, cxl_dstate, + "device-status", CXL_DEVICE_STATUS_REGISTERS_LENGTH); + memory_region_init_io(&cxl_dstate->mailbox, obj, &mailbox_ops, cxl_dstate, + "mailbox", CXL_MAILBOX_REGISTERS_LENGTH); + memory_region_init_io(&cxl_dstate->memory_device, obj, &mdev_ops, + cxl_dstate, "memory device caps", + CXL_MEMORY_DEVICE_REGISTERS_LENGTH); + + memory_region_add_subregion(&cxl_dstate->device_registers, 0, + &cxl_dstate->caps); + memory_region_add_subregion(&cxl_dstate->device_registers, + CXL_DEVICE_STATUS_REGISTERS_OFFSET, + &cxl_dstate->device); + memory_region_add_subregion(&cxl_dstate->device_registers, + CXL_MAILBOX_REGISTERS_OFFSET, + &cxl_dstate->mailbox); + memory_region_add_subregion(&cxl_dstate->device_registers, + CXL_MEMORY_DEVICE_REGISTERS_OFFSET, + &cxl_dstate->memory_device); +} + +static void device_reg_init_common(CXLDeviceState *cxl_dstate) { } + +static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate) +{ + /* 2048 payload size, with no interrupt or background support */ + ARRAY_FIELD_DP32(cxl_dstate->mbox_reg_state32, CXL_DEV_MAILBOX_CAP, + PAYLOAD_SIZE, CXL_MAILBOX_PAYLOAD_SHIFT); + cxl_dstate->payload_size = CXL_MAILBOX_MAX_PAYLOAD_SIZE; +} + +static void memdev_reg_init_common(CXLDeviceState *cxl_dstate) { } + +void cxl_device_register_init_common(CXLDeviceState *cxl_dstate) +{ + uint64_t *cap_hdrs = cxl_dstate->caps_reg_state64; + const int cap_count = 3; + + /* CXL Device Capabilities Array Register */ + ARRAY_FIELD_DP64(cap_hdrs, CXL_DEV_CAP_ARRAY, CAP_ID, 0); + ARRAY_FIELD_DP64(cap_hdrs, CXL_DEV_CAP_ARRAY, CAP_VERSION, 1); + ARRAY_FIELD_DP64(cap_hdrs, CXL_DEV_CAP_ARRAY, CAP_COUNT, cap_count); + + cxl_device_cap_init(cxl_dstate, DEVICE_STATUS, 1); + device_reg_init_common(cxl_dstate); + + cxl_device_cap_init(cxl_dstate, MAILBOX, 2); + mailbox_reg_init_common(cxl_dstate); + + cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000); + memdev_reg_init_common(cxl_dstate); + + assert(cxl_initialize_mailbox(cxl_dstate) == 0); +} diff --git a/hw/cxl/cxl-host-stubs.c b/hw/cxl/cxl-host-stubs.c new file mode 100644 index 0000000000..24465a52ab --- /dev/null +++ b/hw/cxl/cxl-host-stubs.c @@ -0,0 +1,16 @@ +/* + * CXL host parameter parsing routine stubs + * + * Copyright (c) 2022 Huawei + */ +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/cxl/cxl.h" + +void cxl_fixed_memory_window_config(MachineState *ms, + CXLFixedMemoryWindowOptions *object, + Error **errp) {}; + +void cxl_fixed_memory_window_link_targets(Error **errp) {}; + +const MemoryRegionOps cfmws_ops; diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c new file mode 100644 index 0000000000..469b3c4ced 
--- /dev/null
+++ b/hw/cxl/cxl-host.c
@@ -0,0 +1,222 @@
+/*
+ * CXL host parameter parsing routines
+ *
+ * Copyright (c) 2022 Huawei
+ * Modeled loosely on the NUMA options handling in hw/core/numa.c
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qemu/bitmap.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "sysemu/qtest.h"
+#include "hw/boards.h"
+
+#include "qapi/qapi-visit-machine.h"
+#include "hw/cxl/cxl.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_port.h"
+
+void cxl_fixed_memory_window_config(MachineState *ms,
+                                    CXLFixedMemoryWindowOptions *object,
+                                    Error **errp)
+{
+    CXLFixedWindow *fw = g_malloc0(sizeof(*fw));
+    strList *target;
+    int i;
+
+    for (target = object->targets; target; target = target->next) {
+        fw->num_targets++;
+    }
+
+    fw->enc_int_ways = cxl_interleave_ways_enc(fw->num_targets, errp);
+    if (*errp) {
+        return;
+    }
+
+    fw->targets = g_malloc0_n(fw->num_targets, sizeof(*fw->targets));
+    for (i = 0, target = object->targets; target; i++, target = target->next) {
+        /* This link cannot be resolved yet, so stash the name for now */
+        fw->targets[i] = g_strdup(target->value);
+    }
+
+    if (object->size % (256 * MiB)) {
+        error_setg(errp,
+                   "Size of a CXL fixed memory window must be a multiple of 256MiB");
+        return;
+    }
+    fw->size = object->size;
+
+    if (object->has_interleave_granularity) {
+        fw->enc_int_gran =
+            cxl_interleave_granularity_enc(object->interleave_granularity,
+                                           errp);
+        if (*errp) {
+            return;
+        }
+    } else {
+        /* Default to 256 byte interleave */
+        fw->enc_int_gran = 0;
+    }
+
+    ms->cxl_devices_state->fixed_windows =
+        g_list_append(ms->cxl_devices_state->fixed_windows, fw);
+
+    return;
+}
+
+void cxl_fixed_memory_window_link_targets(Error **errp)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+
+    if (ms->cxl_devices_state && ms->cxl_devices_state->fixed_windows) {
+        GList *it;
+
+        for (it = ms->cxl_devices_state->fixed_windows; it; it = it->next) {
+            CXLFixedWindow *fw = it->data;
+            int i;
+
+            for (i = 0; i < fw->num_targets; i++) {
+                Object *o;
+                bool ambig;
+
+                o = object_resolve_path_type(fw->targets[i],
+                                             TYPE_PXB_CXL_DEVICE,
+                                             &ambig);
+                if (!o) {
+                    error_setg(errp, "Could not resolve CXLFM target %s",
+                               fw->targets[i]);
+                    return;
+                }
+                fw->target_hbs[i] = PXB_CXL_DEV(o);
+            }
+        }
+    }
+}
+
+/* TODO: support multiple HDM decoders */
+static bool cxl_hdm_find_target(uint32_t *cache_mem, hwaddr addr,
+                                uint8_t *target)
+{
+    uint32_t ctrl;
+    uint32_t ig_enc;
+    uint32_t iw_enc;
+    uint32_t target_reg;
+    uint32_t target_idx;
+
+    ctrl = cache_mem[R_CXL_HDM_DECODER0_CTRL];
+    if (!FIELD_EX32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED)) {
+        return false;
+    }
+
+    ig_enc = FIELD_EX32(ctrl, CXL_HDM_DECODER0_CTRL, IG);
+    iw_enc = FIELD_EX32(ctrl, CXL_HDM_DECODER0_CTRL, IW);
+    target_idx = (addr / cxl_decode_ig(ig_enc)) % (1 << iw_enc);
+
+    if (target_idx < 4) {
+        target_reg = cache_mem[R_CXL_HDM_DECODER0_TARGET_LIST_LO];
+        target_reg >>= target_idx * 8;
+    } else {
+        target_reg = cache_mem[R_CXL_HDM_DECODER0_TARGET_LIST_HI];
+        target_reg >>= (target_idx - 4) * 8;
+    }
+    *target = target_reg & 0xff;
+
+    return true;
+}
+
+static PCIDevice *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr)
+{
+    CXLComponentState *hb_cstate;
+    PCIHostState *hb;
+    int rb_index;
+    uint32_t *cache_mem;
+    uint8_t target;
+    bool target_found;
+    PCIDevice *rp, *d;
+
+    /* Address is relative to memory region.
Convert to HPA */ + addr += fw->base; + + rb_index = (addr / cxl_decode_ig(fw->enc_int_gran)) % fw->num_targets; + hb = PCI_HOST_BRIDGE(fw->target_hbs[rb_index]->cxl.cxl_host_bridge); + if (!hb || !hb->bus || !pci_bus_is_cxl(hb->bus)) { + return NULL; + } + + hb_cstate = cxl_get_hb_cstate(hb); + if (!hb_cstate) { + return NULL; + } + + cache_mem = hb_cstate->crb.cache_mem_registers; + + target_found = cxl_hdm_find_target(cache_mem, addr, &target); + if (!target_found) { + return NULL; + } + + rp = pcie_find_port_by_pn(hb->bus, target); + if (!rp) { + return NULL; + } + + d = pci_bridge_get_sec_bus(PCI_BRIDGE(rp))->devices[0]; + + if (!d || !object_dynamic_cast(OBJECT(d), TYPE_CXL_TYPE3)) { + return NULL; + } + + return d; +} + +static MemTxResult cxl_read_cfmws(void *opaque, hwaddr addr, uint64_t *data, + unsigned size, MemTxAttrs attrs) +{ + CXLFixedWindow *fw = opaque; + PCIDevice *d; + + d = cxl_cfmws_find_device(fw, addr); + if (d == NULL) { + *data = 0; + /* Reads from an invalid address return poison */ + return MEMTX_ERROR; + } + + return cxl_type3_read(d, addr + fw->base, data, size, attrs); +} + +static MemTxResult cxl_write_cfmws(void *opaque, hwaddr addr, + uint64_t data, unsigned size, + MemTxAttrs attrs) +{ + CXLFixedWindow *fw = opaque; + PCIDevice *d; + + d = cxl_cfmws_find_device(fw, addr); + if (d == NULL) { + /* Writes to an invalid address are silently dropped */ + return MEMTX_OK; + } + + return cxl_type3_write(d, addr + fw->base, data, size, attrs); +} + +const MemoryRegionOps cfmws_ops = { + .read_with_attrs = cxl_read_cfmws, + .write_with_attrs = cxl_write_cfmws, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = true, + }, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = true, + }, +}; diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c new file mode 100644 index 0000000000..bb66c765a5 --- /dev/null +++ b/hw/cxl/cxl-mailbox-utils.c @@ -0,0 +1,478 @@ +/* + * CXL Utility library for mailbox interface + * + * Copyright(C) 2020 Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/cxl/cxl.h" +#include "hw/pci/pci.h" +#include "qemu/cutils.h" +#include "qemu/log.h" +#include "qemu/uuid.h" + +/* + * How to add a new command, by example. The command set FOO, with cmd BAR. + * 1. Add the command set and cmd to the enum. + * FOO = 0x7f, + * #define BAR 0 + * 2. Implement the handler + * static ret_code cmd_foo_bar(struct cxl_cmd *cmd, + * CXLDeviceState *cxl_dstate, uint16_t *len) + * 3. Add the command to the cxl_cmd_set[][] + * [FOO][BAR] = { "FOO_BAR", cmd_foo_bar, x, y }, + * 4. Implement your handler + * define_mailbox_handler(FOO_BAR) { ... return CXL_MBOX_SUCCESS; } + * + * + * Writing the handler: + * The handler will provide the &struct cxl_cmd, the &CXLDeviceState, and the + * in/out length of the payload. The handler is responsible for consuming the + * payload from cmd->payload and operating upon it as necessary. It must then + * fill the output data into cmd->payload (overwriting what was there), + * setting the length, and returning a valid return code. + * + * XXX: The handler need not worry about endianness. The payload is read out of + * a register interface that already deals with it.
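+ * + * For a concrete in-tree example, the Timestamp Set command below is + * registered as + * [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, IMMEDIATE_POLICY_CHANGE }, + * i.e. it expects an 8 byte input payload and reports an immediate policy + * change as its effect.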
+ */ + +enum { + EVENTS = 0x01, + #define GET_RECORDS 0x0 + #define CLEAR_RECORDS 0x1 + #define GET_INTERRUPT_POLICY 0x2 + #define SET_INTERRUPT_POLICY 0x3 + FIRMWARE_UPDATE = 0x02, + #define GET_INFO 0x0 + TIMESTAMP = 0x03, + #define GET 0x0 + #define SET 0x1 + LOGS = 0x04, + #define GET_SUPPORTED 0x0 + #define GET_LOG 0x1 + IDENTIFY = 0x40, + #define MEMORY_DEVICE 0x0 + CCLS = 0x41, + #define GET_PARTITION_INFO 0x0 + #define GET_LSA 0x2 + #define SET_LSA 0x3 +}; + +/* 8.2.8.4.5.1 Command Return Codes */ +typedef enum { + CXL_MBOX_SUCCESS = 0x0, + CXL_MBOX_BG_STARTED = 0x1, + CXL_MBOX_INVALID_INPUT = 0x2, + CXL_MBOX_UNSUPPORTED = 0x3, + CXL_MBOX_INTERNAL_ERROR = 0x4, + CXL_MBOX_RETRY_REQUIRED = 0x5, + CXL_MBOX_BUSY = 0x6, + CXL_MBOX_MEDIA_DISABLED = 0x7, + CXL_MBOX_FW_XFER_IN_PROGRESS = 0x8, + CXL_MBOX_FW_XFER_OUT_OF_ORDER = 0x9, + CXL_MBOX_FW_AUTH_FAILED = 0xa, + CXL_MBOX_FW_INVALID_SLOT = 0xb, + CXL_MBOX_FW_ROLLEDBACK = 0xc, + CXL_MBOX_FW_REST_REQD = 0xd, + CXL_MBOX_INVALID_HANDLE = 0xe, + CXL_MBOX_INVALID_PA = 0xf, + CXL_MBOX_INJECT_POISON_LIMIT = 0x10, + CXL_MBOX_PERMANENT_MEDIA_FAILURE = 0x11, + CXL_MBOX_ABORTED = 0x12, + CXL_MBOX_INVALID_SECURITY_STATE = 0x13, + CXL_MBOX_INCORRECT_PASSPHRASE = 0x14, + CXL_MBOX_UNSUPPORTED_MAILBOX = 0x15, + CXL_MBOX_INVALID_PAYLOAD_LENGTH = 0x16, + CXL_MBOX_MAX = 0x17 +} ret_code; + +struct cxl_cmd; +typedef ret_code (*opcode_handler)(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, uint16_t *len); +struct cxl_cmd { + const char *name; + opcode_handler handler; + ssize_t in; + uint16_t effect; /* Reported in CEL */ + uint8_t *payload; +}; + +#define DEFINE_MAILBOX_HANDLER_ZEROED(name, size) \ + uint16_t __zero##name = size; \ + static ret_code cmd_##name(struct cxl_cmd *cmd, \ + CXLDeviceState *cxl_dstate, uint16_t *len) \ + { \ + *len = __zero##name; \ + memset(cmd->payload, 0, *len); \ + return CXL_MBOX_SUCCESS; \ + } +#define DEFINE_MAILBOX_HANDLER_NOP(name) \ + static ret_code cmd_##name(struct cxl_cmd *cmd, \ + CXLDeviceState *cxl_dstate, uint16_t *len) \ + { \ + return CXL_MBOX_SUCCESS; \ + } + +DEFINE_MAILBOX_HANDLER_ZEROED(events_get_records, 0x20); +DEFINE_MAILBOX_HANDLER_NOP(events_clear_records); +DEFINE_MAILBOX_HANDLER_ZEROED(events_get_interrupt_policy, 4); +DEFINE_MAILBOX_HANDLER_NOP(events_set_interrupt_policy); + +/* 8.2.9.2.1 */ +static ret_code cmd_firmware_update_get_info(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct { + uint8_t slots_supported; + uint8_t slot_info; + uint8_t caps; + uint8_t rsvd[0xd]; + char fw_rev1[0x10]; + char fw_rev2[0x10]; + char fw_rev3[0x10]; + char fw_rev4[0x10]; + } QEMU_PACKED *fw_info; + QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50); + + if (cxl_dstate->pmem_size < (256 << 20)) { + return CXL_MBOX_INTERNAL_ERROR; + } + + fw_info = (void *)cmd->payload; + memset(fw_info, 0, sizeof(*fw_info)); + + fw_info->slots_supported = 2; + fw_info->slot_info = BIT(0) | BIT(3); + fw_info->caps = 0; + pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0"); + + *len = sizeof(*fw_info); + return CXL_MBOX_SUCCESS; +} + +/* 8.2.9.3.1 */ +static ret_code cmd_timestamp_get(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + uint64_t time, delta; + uint64_t final_time = 0; + + if (cxl_dstate->timestamp.set) { + /* First find the delta from the last time the host set the time. 
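The guest-visible time is then host_set + (now - last_set).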
*/ + time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + delta = time - cxl_dstate->timestamp.last_set; + final_time = cxl_dstate->timestamp.host_set + delta; + } + + /* Then write the adjusted time to the payload */ + stq_le_p(cmd->payload, final_time); + *len = 8; + + return CXL_MBOX_SUCCESS; +} + +/* 8.2.9.3.2 */ +static ret_code cmd_timestamp_set(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + cxl_dstate->timestamp.set = true; + cxl_dstate->timestamp.last_set = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + + cxl_dstate->timestamp.host_set = le64_to_cpu(*(uint64_t *)cmd->payload); + + *len = 0; + return CXL_MBOX_SUCCESS; +} + +static QemuUUID cel_uuid; + +/* 8.2.9.4.1 */ +static ret_code cmd_logs_get_supported(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct { + uint16_t entries; + uint8_t rsvd[6]; + struct { + QemuUUID uuid; + uint32_t size; + } log_entries[1]; + } QEMU_PACKED *supported_logs = (void *)cmd->payload; + QEMU_BUILD_BUG_ON(sizeof(*supported_logs) != 0x1c); + + supported_logs->entries = 1; + supported_logs->log_entries[0].uuid = cel_uuid; + supported_logs->log_entries[0].size = 4 * cxl_dstate->cel_size; + + *len = sizeof(*supported_logs); + return CXL_MBOX_SUCCESS; +} + +/* 8.2.9.4.2 */ +static ret_code cmd_logs_get_log(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct { + QemuUUID uuid; + uint32_t offset; + uint32_t length; + } QEMU_PACKED QEMU_ALIGNED(16) *get_log = (void *)cmd->payload; + + /* + * 8.2.9.4.2 + * The device shall return Invalid Parameter if the Offset or Length + * fields attempt to access beyond the size of the log as reported by Get + * Supported Logs. + * + * XXX: Spec is wrong, "Invalid Parameter" isn't a thing. + * XXX: Spec doesn't address handling of an incorrect UUID. + * + * The CEL buffer is large enough to fit all commands in the emulation, so + * the only possible failure would be if the mailbox itself isn't big + * enough.
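+ * + * Note: the offset and length fields are only 32 bits wide, so the bounds + * check below is done in 64 bit arithmetic to avoid wrap-around.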
+ */ + if ((uint64_t)get_log->offset + get_log->length > cxl_dstate->payload_size) { + return CXL_MBOX_INVALID_INPUT; + } + + if (!qemu_uuid_is_equal(&get_log->uuid, &cel_uuid)) { + return CXL_MBOX_UNSUPPORTED; + } + + /* Store off everything to local variables so we can wipe out the payload */ + *len = get_log->length; + + memmove(cmd->payload, cxl_dstate->cel_log + get_log->offset, + get_log->length); + + return CXL_MBOX_SUCCESS; +} + +/* 8.2.9.5.1.1 */ +static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct { + char fw_revision[0x10]; + uint64_t total_capacity; + uint64_t volatile_capacity; + uint64_t persistent_capacity; + uint64_t partition_align; + uint16_t info_event_log_size; + uint16_t warning_event_log_size; + uint16_t failure_event_log_size; + uint16_t fatal_event_log_size; + uint32_t lsa_size; + uint8_t poison_list_max_mer[3]; + uint16_t inject_poison_limit; + uint8_t poison_caps; + uint8_t qos_telemetry_caps; + } QEMU_PACKED *id; + QEMU_BUILD_BUG_ON(sizeof(*id) != 0x43); + + CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate); + CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d); + uint64_t size = cxl_dstate->pmem_size; + + if (!QEMU_IS_ALIGNED(size, 256 << 20)) { + return CXL_MBOX_INTERNAL_ERROR; + } + + id = (void *)cmd->payload; + memset(id, 0, sizeof(*id)); + + /* PMEM only */ + snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0); + + id->total_capacity = size / (256 << 20); + id->persistent_capacity = size / (256 << 20); + id->lsa_size = cvc->get_lsa_size(ct3d); + + *len = sizeof(*id); + return CXL_MBOX_SUCCESS; +} + +static ret_code cmd_ccls_get_partition_info(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct { + uint64_t active_vmem; + uint64_t active_pmem; + uint64_t next_vmem; + uint64_t next_pmem; + } QEMU_PACKED *part_info = (void *)cmd->payload; + QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20); + uint64_t size = cxl_dstate->pmem_size; + + if (!QEMU_IS_ALIGNED(size, 256 << 20)) { + return CXL_MBOX_INTERNAL_ERROR; + } + + /* PMEM only */ + part_info->active_vmem = 0; + part_info->next_vmem = 0; + part_info->active_pmem = size / (256 << 20); + part_info->next_pmem = 0; + + *len = sizeof(*part_info); + return CXL_MBOX_SUCCESS; +} + +static ret_code cmd_ccls_get_lsa(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct { + uint32_t offset; + uint32_t length; + } QEMU_PACKED *get_lsa; + CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate); + CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d); + uint32_t offset, length; + + get_lsa = (void *)cmd->payload; + offset = get_lsa->offset; + length = get_lsa->length; + + if ((uint64_t)offset + length > cvc->get_lsa_size(ct3d)) { + *len = 0; + return CXL_MBOX_INVALID_INPUT; + } + + *len = cvc->get_lsa(ct3d, get_lsa, length, offset); + return CXL_MBOX_SUCCESS; +} + +static ret_code cmd_ccls_set_lsa(struct cxl_cmd *cmd, + CXLDeviceState *cxl_dstate, + uint16_t *len) +{ + struct set_lsa_pl { + uint32_t offset; + uint32_t rsvd; + uint8_t data[]; + } QEMU_PACKED; + struct set_lsa_pl *set_lsa_payload = (void *)cmd->payload; + CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate); + CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d); + const size_t hdr_len = offsetof(struct set_lsa_pl, data); + uint16_t plen = *len; + + *len = 0; + if (!plen) { + return CXL_MBOX_SUCCESS; + } + + if ((uint64_t)set_lsa_payload->offset + plen > cvc->get_lsa_size(ct3d) + hdr_len) { + return CXL_MBOX_INVALID_INPUT; + } + 
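/* Strip the 8 byte header (offset + rsvd) before writing the data */ + 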
plen -= hdr_len; + + cvc->set_lsa(ct3d, set_lsa_payload->data, plen, set_lsa_payload->offset); + return CXL_MBOX_SUCCESS; +} + +#define IMMEDIATE_CONFIG_CHANGE (1 << 1) +#define IMMEDIATE_DATA_CHANGE (1 << 2) +#define IMMEDIATE_POLICY_CHANGE (1 << 3) +#define IMMEDIATE_LOG_CHANGE (1 << 4) + +static struct cxl_cmd cxl_cmd_set[256][256] = { + [EVENTS][GET_RECORDS] = { "EVENTS_GET_RECORDS", + cmd_events_get_records, 1, 0 }, + [EVENTS][CLEAR_RECORDS] = { "EVENTS_CLEAR_RECORDS", + cmd_events_clear_records, ~0, IMMEDIATE_LOG_CHANGE }, + [EVENTS][GET_INTERRUPT_POLICY] = { "EVENTS_GET_INTERRUPT_POLICY", + cmd_events_get_interrupt_policy, 0, 0 }, + [EVENTS][SET_INTERRUPT_POLICY] = { "EVENTS_SET_INTERRUPT_POLICY", + cmd_events_set_interrupt_policy, 4, IMMEDIATE_CONFIG_CHANGE }, + [FIRMWARE_UPDATE][GET_INFO] = { "FIRMWARE_UPDATE_GET_INFO", + cmd_firmware_update_get_info, 0, 0 }, + [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, + [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, IMMEDIATE_POLICY_CHANGE }, + [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, + [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, + [IDENTIFY][MEMORY_DEVICE] = { "IDENTIFY_MEMORY_DEVICE", + cmd_identify_memory_device, 0, 0 }, + [CCLS][GET_PARTITION_INFO] = { "CCLS_GET_PARTITION_INFO", + cmd_ccls_get_partition_info, 0, 0 }, + [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 0, 0 }, + [CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa, + ~0, IMMEDIATE_CONFIG_CHANGE | IMMEDIATE_DATA_CHANGE }, +}; + +void cxl_process_mailbox(CXLDeviceState *cxl_dstate) +{ + uint16_t ret = CXL_MBOX_SUCCESS; + struct cxl_cmd *cxl_cmd; + uint64_t status_reg; + opcode_handler h; + uint64_t command_reg = cxl_dstate->mbox_reg_state64[R_CXL_DEV_MAILBOX_CMD]; + + uint8_t set = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND_SET); + uint8_t cmd = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND); + uint16_t len = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, LENGTH); + cxl_cmd = &cxl_cmd_set[set][cmd]; + h = cxl_cmd->handler; + if (h) { + if (len == cxl_cmd->in || cxl_cmd->in == ~0) { + cxl_cmd->payload = cxl_dstate->mbox_reg_state + + A_CXL_DEV_CMD_PAYLOAD; + ret = (*h)(cxl_cmd, cxl_dstate, &len); + assert(len <= cxl_dstate->payload_size); + } else { + ret = CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } + } else { + qemu_log_mask(LOG_UNIMP, "Command %04xh not implemented\n", + set << 8 | cmd); + ret = CXL_MBOX_UNSUPPORTED; + } + + /* Set the return code */ + status_reg = FIELD_DP64(0, CXL_DEV_MAILBOX_STS, ERRNO, ret); + + /* Set the return length */ + command_reg = FIELD_DP64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND_SET, 0); + command_reg = FIELD_DP64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND, 0); + command_reg = FIELD_DP64(command_reg, CXL_DEV_MAILBOX_CMD, LENGTH, len); + + cxl_dstate->mbox_reg_state64[R_CXL_DEV_MAILBOX_CMD] = command_reg; + cxl_dstate->mbox_reg_state64[R_CXL_DEV_MAILBOX_STS] = status_reg; + + /* Tell the host we're done */ + ARRAY_FIELD_DP32(cxl_dstate->mbox_reg_state32, CXL_DEV_MAILBOX_CTRL, + DOORBELL, 0); +} + +int cxl_initialize_mailbox(CXLDeviceState *cxl_dstate) +{ + /* CXL 2.0: Table 169 Get Supported Logs Log Entry */ + const char *cel_uuidstr = "0da9c0b5-bf41-4b78-8f79-96b1623b3f17"; + + for (int set = 0; set < 256; set++) { + for (int cmd = 0; cmd < 256; cmd++) { + if (cxl_cmd_set[set][cmd].handler) { + struct cxl_cmd *c = &cxl_cmd_set[set][cmd]; + struct cel_log *log = + &cxl_dstate->cel_log[cxl_dstate->cel_size]; + + log->opcode = (set << 8) | cmd; + log->effect = 
c->effect; + cxl_dstate->cel_size++; + } + } + } + + return qemu_uuid_parse(cel_uuidstr, &cel_uuid); +} diff --git a/hw/cxl/meson.build b/hw/cxl/meson.build new file mode 100644 index 0000000000..f117b99949 --- /dev/null +++ b/hw/cxl/meson.build @@ -0,0 +1,12 @@ +softmmu_ss.add(when: 'CONFIG_CXL', + if_true: files( + 'cxl-component-utils.c', + 'cxl-device-utils.c', + 'cxl-mailbox-utils.c', + 'cxl-host.c', + ), + if_false: files( + 'cxl-host-stubs.c', + )) + +softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('cxl-host-stubs.c')) diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c index 09818231bd..96e56c4467 100644 --- a/hw/display/vhost-user-gpu.c +++ b/hw/display/vhost-user-gpu.c @@ -565,6 +565,12 @@ vhost_user_gpu_device_realize(DeviceState *qdev, Error **errp) g->vhost_gpu_fd = -1; } +static struct vhost_dev *vhost_user_gpu_get_vhost(VirtIODevice *vdev) +{ + VhostUserGPU *g = VHOST_USER_GPU(vdev); + return &g->vhost->dev; +} + static Property vhost_user_gpu_properties[] = { VIRTIO_GPU_BASE_PROPERTIES(VhostUserGPU, parent_obj.conf), DEFINE_PROP_END_OF_LIST(), @@ -586,6 +592,7 @@ vhost_user_gpu_class_init(ObjectClass *klass, void *data) vdc->guest_notifier_pending = vhost_user_gpu_guest_notifier_pending; vdc->get_config = vhost_user_gpu_get_config; vdc->set_config = vhost_user_gpu_set_config; + vdc->get_vhost = vhost_user_gpu_get_vhost; device_class_set_props(dc, vhost_user_gpu_properties); } diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c index fff0fb4a82..8ba5da4312 100644 --- a/hw/display/virtio-gpu-base.c +++ b/hw/display/virtio-gpu-base.c @@ -173,7 +173,7 @@ virtio_gpu_base_device_realize(DeviceState *qdev, } g->virtio_config.num_scanouts = cpu_to_le32(g->conf.max_outputs); - virtio_init(VIRTIO_DEVICE(g), "virtio-gpu", VIRTIO_ID_GPU, + virtio_init(VIRTIO_DEVICE(g), VIRTIO_ID_GPU, sizeof(struct virtio_gpu_config)); if (virtio_gpu_virgl_enabled(g->conf)) { diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index dcf6ece3d0..c125939ed6 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -28,6 +28,7 @@ #include "qemu/bitmap.h" #include "qemu/error-report.h" #include "hw/pci/pci.h" +#include "hw/cxl/cxl.h" #include "hw/core/cpu.h" #include "target/i386/cpu.h" #include "hw/misc/pvpanic.h" @@ -66,6 +67,7 @@ #include "hw/acpi/aml-build.h" #include "hw/acpi/utils.h" #include "hw/acpi/pci.h" +#include "hw/acpi/cxl.h" #include "qom/qom-qobject.h" #include "hw/i386/amd_iommu.h" @@ -75,6 +77,7 @@ #include "hw/acpi/ipmi.h" #include "hw/acpi/hmat.h" #include "hw/acpi/viot.h" +#include "hw/acpi/cxl.h" #include CONFIG_DEVICES @@ -1409,6 +1412,22 @@ static void build_smb0(Aml *table, I2CBus *smbus, int devnr, int func) aml_append(table, scope); } +static void build_acpi0017(Aml *table) +{ + Aml *dev, *scope, *method; + + scope = aml_scope("_SB"); + dev = aml_device("CXLM"); + aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0017"))); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0x01))); + aml_append(dev, method); + + aml_append(scope, dev); + aml_append(table, scope); +} + static void build_dsdt(GArray *table_data, BIOSLinker *linker, AcpiPmInfo *pm, AcpiMiscInfo *misc, @@ -1428,6 +1447,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, #ifdef CONFIG_TPM TPMIf *tpm = tpm_find(); #endif + bool cxl_present = false; int i; VMBusBridge *vmbus_bridge = vmbus_bridge_find(); AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = x86ms->oem_id, @@ -1572,10 +1592,25 @@ build_dsdt(GArray 
*table_data, BIOSLinker *linker, } scope = aml_scope("\\_SB"); - dev = aml_device("PC%.02X", bus_num); + + if (pci_bus_is_cxl(bus)) { + dev = aml_device("CL%.02X", bus_num); + } else { + dev = aml_device("PC%.02X", bus_num); + } aml_append(dev, aml_name_decl("_UID", aml_int(bus_num))); aml_append(dev, aml_name_decl("_BBN", aml_int(bus_num))); - if (pci_bus_is_express(bus)) { + if (pci_bus_is_cxl(bus)) { + struct Aml *pkg = aml_package(2); + + aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0016"))); + aml_append(pkg, aml_eisaid("PNP0A08")); + aml_append(pkg, aml_eisaid("PNP0A03")); + aml_append(dev, aml_name_decl("_CID", pkg)); + aml_append(dev, aml_name_decl("_ADR", aml_int(0))); + build_cxl_osc_method(dev); + } else if (pci_bus_is_express(bus)) { aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08"))); aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); @@ -1595,9 +1630,23 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dev, aml_name_decl("_CRS", crs)); aml_append(scope, dev); aml_append(dsdt, scope); + + /* Handle the ranges for the PXB expanders */ + if (pci_bus_is_cxl(bus)) { + MemoryRegion *mr = &machine->cxl_devices_state->host_mr; + uint64_t base = mr->addr; + + cxl_present = true; + crs_range_insert(crs_range_set.mem_ranges, base, + base + memory_region_size(mr) - 1); + } } } + if (cxl_present) { + build_acpi0017(dsdt); + } + /* * At this point crs_range_set has all the ranges used by pci * busses *other* than PCI0. These ranges will be excluded from @@ -2662,6 +2711,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) machine->nvdimms_state, machine->ram_slots, x86ms->oem_id, x86ms->oem_table_id); } + if (machine->cxl_devices_state->is_enabled) { + cxl_build_cedt(machine, table_offsets, tables_blob, tables->linker, + x86ms->oem_id, x86ms->oem_table_id); + } acpi_add_table(table_offsets, tables_blob); build_waet(tables_blob, tables->linker, x86ms->oem_id, x86ms->oem_table_id); diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index ea8eaeb330..725f69095b 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -201,15 +201,18 @@ static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start, /* * AMDVi event structure * 0:15 -> DeviceID - * 55:63 -> event type + miscellaneous info - * 63:127 -> related address + * 48:63 -> event type + miscellaneous info + * 64:127 -> related address */ static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr, uint16_t info) { + evt[0] = 0; + evt[1] = 0; + amdvi_setevent_bits(evt, devid, 0, 16); - amdvi_setevent_bits(evt, info, 55, 8); - amdvi_setevent_bits(evt, addr, 63, 64); + amdvi_setevent_bits(evt, info, 48, 16); + amdvi_setevent_bits(evt, addr, 64, 64); } /* log an error encountered during a page walk * @@ -218,7 +221,7 @@ static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr, static void amdvi_page_fault(AMDVIState *s, uint16_t devid, hwaddr addr, uint16_t info) { - uint64_t evt[4]; + uint64_t evt[2]; info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; amdvi_encode_event(evt, devid, addr, info); @@ -234,7 +237,7 @@ static void amdvi_page_fault(AMDVIState *s, uint16_t devid, static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, hwaddr devtab, uint16_t info) { - uint64_t evt[4]; + uint64_t evt[2]; info |= AMDVI_EVENT_DEV_TAB_HW_ERROR; @@ -248,7 +251,8 @@ static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, */ static void 
amdvi_log_command_error(AMDVIState *s, hwaddr addr) { - uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR; + uint64_t evt[2]; + uint16_t info = AMDVI_EVENT_COMMAND_HW_ERROR; amdvi_encode_event(evt, 0, addr, info); amdvi_log_event(s, evt); @@ -261,7 +265,7 @@ static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info, hwaddr addr) { - uint64_t evt[4]; + uint64_t evt[2]; info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR; amdvi_encode_event(evt, 0, addr, info); @@ -276,7 +280,7 @@ static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info, static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid, hwaddr addr, uint16_t info) { - uint64_t evt[4]; + uint64_t evt[2]; info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY; amdvi_encode_event(evt, devid, addr, info); @@ -288,7 +292,7 @@ static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid, static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, hwaddr addr, uint16_t info) { - uint64_t evt[4]; + uint64_t evt[2]; info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; amdvi_encode_event(evt, devid, addr, info); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index c64aa81a83..2162394e08 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -181,6 +181,18 @@ static void vtd_update_scalable_state(IntelIOMMUState *s) } } +static void vtd_update_iq_dw(IntelIOMMUState *s) +{ + uint64_t val = vtd_get_quad_raw(s, DMAR_IQA_REG); + + if (s->ecap & VTD_ECAP_SMTS && + val & VTD_IQA_DW_MASK) { + s->iq_dw = true; + } else { + s->iq_dw = false; + } +} + /* Whether the address space needs to notify new mappings */ static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) { @@ -469,11 +481,6 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, assert(fault < VTD_FR_MAX); - if (fault == VTD_FR_RESERVED_ERR) { - /* This is not a normal fault reason case. Drop it. */ - return; - } - trace_vtd_dmar_fault(source_id, fault, addr, is_write); if (fsts_reg & VTD_FSTS_PFO) { @@ -1025,6 +1032,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, uint32_t offset; uint64_t slpte; uint64_t access_right_check; + uint64_t xlat, size; if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", @@ -1069,11 +1077,33 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, if (vtd_is_last_slpte(slpte, level)) { *slptep = slpte; *slpte_level = level; - return 0; + break; } addr = vtd_get_slpte_addr(slpte, aw_bits); level--; } + + xlat = vtd_get_slpte_addr(*slptep, aw_bits); + size = ~vtd_slpt_level_page_mask(level) + 1; + + /* + * From VT-d spec 3.14: Untranslated requests and translation + * requests that result in an address in the interrupt range will be + * blocked with condition code LGN.4 or SGN.8. + */ + if ((xlat > VTD_INTERRUPT_ADDR_LAST || + xlat + size - 1 < VTD_INTERRUPT_ADDR_FIRST)) { + return 0; + } else { + error_report_once("%s: xlat address is in interrupt range " + "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " + "slpte=0x%" PRIx64 ", write=%d, " + "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ")", + __func__, iova, level, slpte, is_write, + xlat, size); + return s->scalable_mode ? 
-VTD_FR_SM_INTERRUPT_ADDR : + -VTD_FR_INTERRUPT_ADDR; + } } typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); @@ -1633,11 +1663,12 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_PAGING_ENTRY_INV] = true, [VTD_FR_ROOT_TABLE_INV] = false, [VTD_FR_CONTEXT_TABLE_INV] = false, + [VTD_FR_INTERRUPT_ADDR] = true, [VTD_FR_ROOT_ENTRY_RSVD] = false, [VTD_FR_PAGING_ENTRY_RSVD] = true, [VTD_FR_CONTEXT_ENTRY_TT] = true, [VTD_FR_PASID_TABLE_INV] = false, - [VTD_FR_RESERVED_ERR] = false, + [VTD_FR_SM_INTERRUPT_ADDR] = true, [VTD_FR_MAX] = false, }; @@ -2209,12 +2240,13 @@ static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) /* Handle write to Global Command Register */ static void vtd_handle_gcmd_write(IntelIOMMUState *s) { + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); uint32_t changed = status ^ val; trace_vtd_reg_write_gcmd(status, val); - if (changed & VTD_GCMD_TE) { + if ((changed & VTD_GCMD_TE) && s->dma_translation) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); } @@ -2230,7 +2262,8 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) /* Set/update the interrupt remapping root-table pointer */ vtd_handle_gcmd_sirtp(s); } - if (changed & VTD_GCMD_IRE) { + if ((changed & VTD_GCMD_IRE) && + x86_iommu_ir_supported(x86_iommu)) { /* Interrupt remap enable/disable */ vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); } @@ -2883,12 +2916,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, } else { vtd_set_quad(s, addr, val); } - if (s->ecap & VTD_ECAP_SMTS && - val & VTD_IQA_DW_MASK) { - s->iq_dw = true; - } else { - s->iq_dw = false; - } + vtd_update_iq_dw(s); break; case DMAR_IQA_REG_HI: @@ -3032,7 +3060,7 @@ static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, /* TODO: add support for VFIO and vhost users */ if (s->snoop_control) { - error_setg_errno(errp, -ENOTSUP, + error_setg_errno(errp, ENOTSUP, "Snoop Control with vhost or VFIO is not supported"); return -ENOTSUP; } @@ -3052,13 +3080,6 @@ static int vtd_post_load(void *opaque, int version_id) { IntelIOMMUState *iommu = opaque; - /* - * Memory regions are dynamically turned on/off depending on - * context entry configurations from the guest. After migration, - * we need to make sure the memory regions are still correct. - */ - vtd_switch_address_space_all(iommu); - /* * We don't need to migrate the root_scalable because we can * simply do the calculation after the loading is complete. We @@ -3068,6 +3089,15 @@ static int vtd_post_load(void *opaque, int version_id) */ vtd_update_scalable_state(iommu); + vtd_update_iq_dw(iommu); + + /* + * Memory regions are dynamically turned on/off depending on + * context entry configurations from the guest. After migration, + * we need to make sure the memory regions are still correct. 
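+ * This runs after vtd_update_scalable_state() above, since the address + * space selection depends on the recalculated scalable mode state.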
+ */ + vtd_switch_address_space_all(iommu); + return 0; } @@ -3122,6 +3152,7 @@ static Property vtd_properties[] = { DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), + DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), DEFINE_PROP_END_OF_LIST(), }; @@ -3627,12 +3658,17 @@ static void vtd_init(IntelIOMMUState *s) s->next_frcd_reg = 0; s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | - VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); + VTD_CAP_MGAW(s->aw_bits); if (s->dma_drain) { s->cap |= VTD_CAP_DRAIN; } - if (s->aw_bits == VTD_HOST_AW_48BIT) { - s->cap |= VTD_CAP_SAGAW_48bit; + if (s->dma_translation) { + if (s->aw_bits >= VTD_HOST_AW_39BIT) { + s->cap |= VTD_CAP_SAGAW_39bit; + } + if (s->aw_bits >= VTD_HOST_AW_48BIT) { + s->cap |= VTD_CAP_SAGAW_48bit; + } } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; @@ -3778,15 +3814,10 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; } if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { - if (!kvm_irqchip_in_kernel()) { + if (!kvm_irqchip_is_split()) { error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); return false; } - if (!kvm_enable_x2apic()) { - error_setg(errp, "eim=on requires support on the KVM side" - "(X2APIC_API, first shipped in v4.7)"); - return false; - } } /* Currently only address widths supported are 39 and 48 bits */ diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 1ff13b40f9..930ce61feb 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -289,6 +289,8 @@ typedef enum VTDFaultReason { * context-entry. */ VTD_FR_CONTEXT_ENTRY_TT, + /* Output address in the interrupt address range */ + VTD_FR_INTERRUPT_ADDR = 0xE, /* Interrupt remapping transition faults */ VTD_FR_IR_REQ_RSVD = 0x20, /* One or more IR request reserved @@ -304,11 +306,8 @@ typedef enum VTDFaultReason { VTD_FR_PASID_TABLE_INV = 0x58, /*Invalid PASID table entry */ - /* This is not a normal fault reason. We use this to indicate some faults - * that are not referenced by the VT-d specification. - * Fault event with such reason should not be recorded. 
- */ - VTD_FR_RESERVED_ERR, + /* Output address in the interrupt address range for scalable mode */ + VTD_FR_SM_INTERRUPT_ADDR = 0x87, VTD_FR_MAX, /* Guard */ } VTDFaultReason; diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c index 4b3b1dd262..754f1d0593 100644 --- a/hw/i386/microvm.c +++ b/hw/i386/microvm.c @@ -247,7 +247,7 @@ static void microvm_devices_init(MicrovmMachineState *mms) x86ms->pci_irq_mask = 0; } - if (mms->pic == ON_OFF_AUTO_ON || mms->pic == ON_OFF_AUTO_AUTO) { + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { qemu_irq *i8259; i8259 = i8259_init(isa_bus, x86_allocate_cpu_irq()); @@ -257,7 +257,7 @@ static void microvm_devices_init(MicrovmMachineState *mms) g_free(i8259); } - if (mms->pit == ON_OFF_AUTO_ON || mms->pit == ON_OFF_AUTO_AUTO) { + if (x86ms->pit == ON_OFF_AUTO_ON || x86ms->pit == ON_OFF_AUTO_AUTO) { if (kvm_pit_in_kernel()) { kvm_pit_init(isa_bus, 0x40); } else { @@ -491,40 +491,6 @@ static void microvm_machine_reset(MachineState *machine) } } -static void microvm_machine_get_pic(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) -{ - MicrovmMachineState *mms = MICROVM_MACHINE(obj); - OnOffAuto pic = mms->pic; - - visit_type_OnOffAuto(v, name, &pic, errp); -} - -static void microvm_machine_set_pic(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) -{ - MicrovmMachineState *mms = MICROVM_MACHINE(obj); - - visit_type_OnOffAuto(v, name, &mms->pic, errp); -} - -static void microvm_machine_get_pit(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) -{ - MicrovmMachineState *mms = MICROVM_MACHINE(obj); - OnOffAuto pit = mms->pit; - - visit_type_OnOffAuto(v, name, &pit, errp); -} - -static void microvm_machine_set_pit(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) -{ - MicrovmMachineState *mms = MICROVM_MACHINE(obj); - - visit_type_OnOffAuto(v, name, &mms->pit, errp); -} - static void microvm_machine_get_rtc(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { @@ -649,8 +615,6 @@ static void microvm_machine_initfn(Object *obj) MicrovmMachineState *mms = MICROVM_MACHINE(obj); /* Configuration */ - mms->pic = ON_OFF_AUTO_AUTO; - mms->pit = ON_OFF_AUTO_AUTO; mms->rtc = ON_OFF_AUTO_AUTO; mms->pcie = ON_OFF_AUTO_AUTO; mms->ioapic2 = ON_OFF_AUTO_AUTO; @@ -702,20 +666,6 @@ static void microvm_class_init(ObjectClass *oc, void *data) x86mc->fwcfg_dma_enabled = true; - object_class_property_add(oc, MICROVM_MACHINE_PIC, "OnOffAuto", - microvm_machine_get_pic, - microvm_machine_set_pic, - NULL, NULL); - object_class_property_set_description(oc, MICROVM_MACHINE_PIC, - "Enable i8259 PIC"); - - object_class_property_add(oc, MICROVM_MACHINE_PIT, "OnOffAuto", - microvm_machine_get_pit, - microvm_machine_set_pit, - NULL, NULL); - object_class_property_set_description(oc, MICROVM_MACHINE_PIT, - "Enable i8254 PIT"); - object_class_property_add(oc, MICROVM_MACHINE_RTC, "OnOffAuto", microvm_machine_get_rtc, microvm_machine_set_rtc, diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 305d2c0820..7c39c91335 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -75,6 +75,7 @@ #include "acpi-build.h" #include "hw/mem/pc-dimm.h" #include "hw/mem/nvdimm.h" +#include "hw/cxl/cxl.h" #include "qapi/error.h" #include "qapi/qapi-visit-common.h" #include "qapi/qapi-visit-machine.h" @@ -743,14 +744,6 @@ void pc_machine_done(Notifier *notifier, void *data) /* update FW_CFG_NB_CPUS to account for -device added CPUs */ fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, 
x86ms->boot_cpus); } - - - if (x86ms->apic_id_limit > 255 && !xen_enabled() && - !kvm_irqchip_in_kernel()) { - error_report("current -smp configuration requires kernel " - "irqchip support."); - exit(EXIT_FAILURE); - } } void pc_guest_info_init(PCMachineState *pcms) @@ -816,6 +809,7 @@ void pc_memory_init(PCMachineState *pcms, MachineClass *mc = MACHINE_GET_CLASS(machine); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); X86MachineState *x86ms = X86_MACHINE(pcms); + hwaddr cxl_base, cxl_resv_end = 0; assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); @@ -905,6 +899,44 @@ void pc_memory_init(PCMachineState *pcms, &machine->device_memory->mr); } + if (machine->cxl_devices_state->is_enabled) { + MemoryRegion *mr = &machine->cxl_devices_state->host_mr; + hwaddr cxl_size = MiB; + + if (pcmc->has_reserved_memory && machine->device_memory->base) { + cxl_base = machine->device_memory->base; + if (!pcmc->broken_reserved_end) { + cxl_base += memory_region_size(&machine->device_memory->mr); + } + } else if (pcms->sgx_epc.size != 0) { + cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); + } else { + cxl_base = 0x100000000ULL + x86ms->above_4g_mem_size; + } + + e820_add_entry(cxl_base, cxl_size, E820_RESERVED); + memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); + memory_region_add_subregion(system_memory, cxl_base, mr); + cxl_resv_end = cxl_base + cxl_size; + if (machine->cxl_devices_state->fixed_windows) { + hwaddr cxl_fmw_base; + GList *it; + + cxl_fmw_base = ROUND_UP(cxl_base + cxl_size, 256 * MiB); + for (it = machine->cxl_devices_state->fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + + fw->base = cxl_fmw_base; + memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw, + "cxl-fixed-memory-region", fw->size); + memory_region_add_subregion(system_memory, fw->base, &fw->mr); + e820_add_entry(fw->base, fw->size, E820_RESERVED); + cxl_fmw_base += fw->size; + cxl_resv_end = cxl_fmw_base; + } + } + } + /* Initialize PC system firmware */ pc_system_firmware_init(pcms, rom_memory); @@ -932,6 +964,10 @@ void pc_memory_init(PCMachineState *pcms, if (!pcmc->broken_reserved_end) { res_mem_end += memory_region_size(&machine->device_memory->mr); } + + if (machine->cxl_devices_state->is_enabled) { + res_mem_end = cxl_resv_end; + } *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); } @@ -965,7 +1001,17 @@ uint64_t pc_pci_hole64_start(void) X86MachineState *x86ms = X86_MACHINE(pcms); uint64_t hole64_start = 0; - if (pcmc->has_reserved_memory && ms->device_memory->base) { + if (ms->cxl_devices_state->host_mr.addr) { + hole64_start = ms->cxl_devices_state->host_mr.addr + + memory_region_size(&ms->cxl_devices_state->host_mr); + if (ms->cxl_devices_state->fixed_windows) { + GList *it; + for (it = ms->cxl_devices_state->fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + hole64_start = fw->mr.addr + memory_region_size(&fw->mr); + } + } + } else if (pcmc->has_reserved_memory && ms->device_memory->base) { hole64_start = ms->device_memory->base; if (!pcmc->broken_reserved_end) { hole64_start += memory_region_size(&ms->device_memory->mr); @@ -1077,6 +1123,7 @@ void pc_basic_device_init(struct PCMachineState *pcms, ISADevice *pit = NULL; MemoryRegion *ioport80_io = g_new(MemoryRegion, 1); MemoryRegion *ioportF0_io = g_new(MemoryRegion, 1); + X86MachineState *x86ms = X86_MACHINE(pcms); memory_region_init_io(ioport80_io, NULL, &ioport80_io_ops, NULL, 
"ioport80", 1); memory_region_add_subregion(isa_bus->address_space_io, 0x80, ioport80_io); @@ -1121,7 +1168,8 @@ void pc_basic_device_init(struct PCMachineState *pcms, qemu_register_boot_set(pc_boot_set, *rtc_state); - if (!xen_enabled() && pcms->pit_enabled) { + if (!xen_enabled() && + (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) { if (kvm_pit_in_kernel()) { pit = kvm_pit_init(isa_bus, 0x40); } else { @@ -1491,20 +1539,6 @@ static void pc_machine_set_sata(Object *obj, bool value, Error **errp) pcms->sata_enabled = value; } -static bool pc_machine_get_pit(Object *obj, Error **errp) -{ - PCMachineState *pcms = PC_MACHINE(obj); - - return pcms->pit_enabled; -} - -static void pc_machine_set_pit(Object *obj, bool value, Error **errp) -{ - PCMachineState *pcms = PC_MACHINE(obj); - - pcms->pit_enabled = value; -} - static bool pc_machine_get_hpet(Object *obj, Error **errp) { PCMachineState *pcms = PC_MACHINE(obj); @@ -1661,7 +1695,6 @@ static void pc_machine_initfn(Object *obj) pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; pcms->smbus_enabled = true; pcms->sata_enabled = true; - pcms->pit_enabled = true; pcms->i8042_enabled = true; pcms->max_fw_size = 8 * MiB; #ifdef CONFIG_HPET @@ -1761,6 +1794,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; mc->nvdimm_supported = true; mc->smp_props.dies_supported = true; + mc->cxl_supported = true; mc->default_ram_id = "pc.ram"; object_class_property_add(oc, PC_MACHINE_MAX_RAM_BELOW_4G, "size", @@ -1789,11 +1823,6 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_set_description(oc, PC_MACHINE_SATA, "Enable/disable Serial ATA bus"); - object_class_property_add_bool(oc, PC_MACHINE_PIT, - pc_machine_get_pit, pc_machine_set_pit); - object_class_property_set_description(oc, PC_MACHINE_PIT, - "Enable/disable Intel 8254 programmable interval timer emulation"); - object_class_property_add_bool(oc, "hpet", pc_machine_get_hpet, pc_machine_set_hpet); object_class_property_set_description(oc, "hpet", diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index f843dd906f..578e537b35 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -218,7 +218,9 @@ static void pc_init1(MachineState *machine, } isa_bus_irqs(isa_bus, x86ms->gsi); - pc_i8259_create(isa_bus, gsi_state->i8259_irq); + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + } if (pcmc->pci_enabled) { ioapic_init_gsi(gsi_state, "i440fx"); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 302288342a..42eb8b9707 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -265,7 +265,9 @@ static void pc_q35_init(MachineState *machine) pci_bus_set_route_irq_fn(host_bus, ich9_route_intx_pin_to_irq); isa_bus = ich9_lpc->isa_bus; - pc_i8259_create(isa_bus, gsi_state->i8259_irq); + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + } if (pcmc->pci_enabled) { ioapic_init_gsi(gsi_state, "q35"); diff --git a/hw/i386/x86.c b/hw/i386/x86.c index 79ebdface6..78b05ab7a2 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -38,6 +38,7 @@ #include "sysemu/replay.h" #include "sysemu/sysemu.h" #include "sysemu/cpu-timers.h" +#include "sysemu/xen.h" #include "trace.h" #include "hw/i386/x86.h" @@ -122,6 +123,21 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) */ x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms, 
ms->smp.max_cpus - 1) + 1; + + /* + * Can we support APIC ID 255 or higher? + * + * Under Xen: yes. + * With userspace emulated lapic: no + * With KVM's in-kernel lapic: only if X2APIC API is enabled. + */ + if (x86ms->apic_id_limit > 255 && !xen_enabled() && + (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) { + error_report("current -smp configuration requires kernel " + "irqchip and X2APIC API support."); + exit(EXIT_FAILURE); + } + possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < ms->smp.cpus; i++) { x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); @@ -1228,6 +1244,40 @@ static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name, visit_type_OnOffAuto(v, name, &x86ms->acpi, errp); } +static void x86_machine_get_pit(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + OnOffAuto pit = x86ms->pit; + + visit_type_OnOffAuto(v, name, &pit, errp); +} + +static void x86_machine_set_pit(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &x86ms->pit, errp); +} + +static void x86_machine_get_pic(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + OnOffAuto pic = x86ms->pic; + + visit_type_OnOffAuto(v, name, &pic, errp); +} + +static void x86_machine_set_pic(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_OnOffAuto(v, name, &x86ms->pic, errp); +} + static char *x86_machine_get_oem_id(Object *obj, Error **errp) { X86MachineState *x86ms = X86_MACHINE(obj); @@ -1317,6 +1367,8 @@ static void x86_machine_initfn(Object *obj) x86ms->smm = ON_OFF_AUTO_AUTO; x86ms->acpi = ON_OFF_AUTO_AUTO; + x86ms->pit = ON_OFF_AUTO_AUTO; + x86ms->pic = ON_OFF_AUTO_AUTO; x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS; x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); @@ -1348,6 +1400,20 @@ static void x86_machine_class_init(ObjectClass *oc, void *data) object_class_property_set_description(oc, X86_MACHINE_ACPI, "Enable ACPI"); + object_class_property_add(oc, X86_MACHINE_PIT, "OnOffAuto", + x86_machine_get_pit, + x86_machine_set_pit, + NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_PIT, + "Enable i8254 PIT"); + + object_class_property_add(oc, X86_MACHINE_PIC, "OnOffAuto", + x86_machine_get_pic, + x86_machine_set_pic, + NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_PIC, + "Enable i8259 PIC"); + object_class_property_add_str(oc, X86_MACHINE_OEM_ID, x86_machine_get_oem_id, x86_machine_set_oem_id); diff --git a/hw/input/vhost-user-input.c b/hw/input/vhost-user-input.c index aeb0624fe5..1352e372ff 100644 --- a/hw/input/vhost-user-input.c +++ b/hw/input/vhost-user-input.c @@ -78,6 +78,12 @@ static void vhost_input_set_config(VirtIODevice *vdev, virtio_notify_config(vdev); } +static struct vhost_dev *vhost_input_get_vhost(VirtIODevice *vdev) +{ + VHostUserInput *vhi = VHOST_USER_INPUT(vdev); + return &vhi->vhost->dev; +} + static const VMStateDescription vmstate_vhost_input = { .name = "vhost-user-input", .unmigratable = 1, @@ -92,6 +98,7 @@ static void vhost_input_class_init(ObjectClass *klass, void *data) dc->vmsd = &vmstate_vhost_input; vdc->get_config = vhost_input_get_config; vdc->set_config = vhost_input_set_config; + vdc->get_vhost = 
vhost_input_get_vhost; vic->realize = vhost_input_realize; vic->change_active = vhost_input_change_active; } diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c index 54bcb46c74..5b5398b3ca 100644 --- a/hw/input/virtio-input.c +++ b/hw/input/virtio-input.c @@ -257,8 +257,7 @@ static void virtio_input_device_realize(DeviceState *dev, Error **errp) vinput->cfg_size += 8; assert(vinput->cfg_size <= sizeof(virtio_input_config)); - virtio_init(vdev, "virtio-input", VIRTIO_ID_INPUT, - vinput->cfg_size); + virtio_init(vdev, VIRTIO_ID_INPUT, vinput->cfg_size); vinput->evt = virtio_add_queue(vdev, 64, virtio_input_handle_evt); vinput->sts = virtio_add_queue(vdev, 64, virtio_input_handle_sts); } diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig index 03dbb3c7df..73c5ae8ad9 100644 --- a/hw/mem/Kconfig +++ b/hw/mem/Kconfig @@ -11,3 +11,8 @@ config NVDIMM config SPARSE_MEM bool + +config CXL_MEM_DEVICE + bool + default y if CXL + select MEM_DEVICE diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c new file mode 100644 index 0000000000..3bf2869573 --- /dev/null +++ b/hw/mem/cxl_type3.c @@ -0,0 +1,371 @@ +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "qemu/error-report.h" +#include "hw/mem/memory-device.h" +#include "hw/mem/pc-dimm.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qemu/log.h" +#include "qemu/module.h" +#include "qemu/pmem.h" +#include "qemu/range.h" +#include "qemu/rcu.h" +#include "sysemu/hostmem.h" +#include "hw/cxl/cxl.h" + +static void build_dvsecs(CXLType3Dev *ct3d) +{ + CXLComponentState *cxl_cstate = &ct3d->cxl_cstate; + uint8_t *dvsec; + + dvsec = (uint8_t *)&(CXLDVSECDevice){ + .cap = 0x1e, + .ctrl = 0x2, + .status2 = 0x2, + .range1_size_hi = ct3d->hostmem->size >> 32, + .range1_size_lo = (2 << 5) | (2 << 2) | 0x3 | + (ct3d->hostmem->size & 0xF0000000), + .range1_base_hi = 0, + .range1_base_lo = 0, + }; + cxl_component_create_dvsec(cxl_cstate, CXL2_TYPE3_DEVICE, + PCIE_CXL_DEVICE_DVSEC_LENGTH, + PCIE_CXL_DEVICE_DVSEC, + PCIE_CXL2_DEVICE_DVSEC_REVID, dvsec); + + dvsec = (uint8_t *)&(CXLDVSECRegisterLocator){ + .rsvd = 0, + .reg0_base_lo = RBI_COMPONENT_REG | CXL_COMPONENT_REG_BAR_IDX, + .reg0_base_hi = 0, + .reg1_base_lo = RBI_CXL_DEVICE_REG | CXL_DEVICE_REG_BAR_IDX, + .reg1_base_hi = 0, + }; + cxl_component_create_dvsec(cxl_cstate, CXL2_TYPE3_DEVICE, + REG_LOC_DVSEC_LENGTH, REG_LOC_DVSEC, + REG_LOC_DVSEC_REVID, dvsec); + dvsec = (uint8_t *)&(CXLDVSECDeviceGPF){ + .phase2_duration = 0x603, /* 3 seconds */ + .phase2_power = 0x33, /* 0x33 milliwatts */ + }; + cxl_component_create_dvsec(cxl_cstate, CXL2_TYPE3_DEVICE, + GPF_DEVICE_DVSEC_LENGTH, GPF_DEVICE_DVSEC, + GPF_DEVICE_DVSEC_REVID, dvsec); +} + +static void hdm_decoder_commit(CXLType3Dev *ct3d, int which) +{ + ComponentRegisters *cregs = &ct3d->cxl_cstate.crb; + uint32_t *cache_mem = cregs->cache_mem_registers; + + assert(which == 0); + + /* TODO: Sanity checks that the decoder is possible */ + ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMIT, 0); + ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, ERR, 0); + + ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1); +} + +static void ct3d_reg_write(void *opaque, hwaddr offset, uint64_t value, + unsigned size) +{ + CXLComponentState *cxl_cstate = opaque; + ComponentRegisters *cregs = &cxl_cstate->crb; + CXLType3Dev *ct3d = container_of(cxl_cstate, CXLType3Dev, cxl_cstate); + uint32_t *cache_mem = cregs->cache_mem_registers; + bool should_commit = false; + int which_hdm = -1; + + assert(size == 
4); + g_assert(offset < CXL2_COMPONENT_CM_REGION_SIZE); + + switch (offset) { + case A_CXL_HDM_DECODER0_CTRL: + should_commit = FIELD_EX32(value, CXL_HDM_DECODER0_CTRL, COMMIT); + which_hdm = 0; + break; + default: + break; + } + + stl_le_p((uint8_t *)cache_mem + offset, value); + if (should_commit) { + hdm_decoder_commit(ct3d, which_hdm); + } +} + +static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) +{ + DeviceState *ds = DEVICE(ct3d); + MemoryRegion *mr; + char *name; + + if (!ct3d->hostmem) { + error_setg(errp, "memdev property must be set"); + return false; + } + + mr = host_memory_backend_get_memory(ct3d->hostmem); + if (!mr) { + error_setg(errp, "memdev property must be set"); + return false; + } + memory_region_set_nonvolatile(mr, true); + memory_region_set_enabled(mr, true); + host_memory_backend_set_mapped(ct3d->hostmem, true); + + if (ds->id) { + name = g_strdup_printf("cxl-type3-dpa-space:%s", ds->id); + } else { + name = g_strdup("cxl-type3-dpa-space"); + } + address_space_init(&ct3d->hostmem_as, mr, name); + g_free(name); + + ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size; + + if (!ct3d->lsa) { + error_setg(errp, "lsa property must be set"); + return false; + } + + return true; +} + +static void ct3_realize(PCIDevice *pci_dev, Error **errp) +{ + CXLType3Dev *ct3d = CXL_TYPE3(pci_dev); + CXLComponentState *cxl_cstate = &ct3d->cxl_cstate; + ComponentRegisters *regs = &cxl_cstate->crb; + MemoryRegion *mr = ®s->component_registers; + uint8_t *pci_conf = pci_dev->config; + + if (!cxl_setup_memory(ct3d, errp)) { + return; + } + + pci_config_set_prog_interface(pci_conf, 0x10); + pci_config_set_class(pci_conf, PCI_CLASS_MEMORY_CXL); + + pcie_endpoint_cap_init(pci_dev, 0x80); + cxl_cstate->dvsec_offset = 0x100; + + ct3d->cxl_cstate.pdev = pci_dev; + build_dvsecs(ct3d); + + regs->special_ops = g_new0(MemoryRegionOps, 1); + regs->special_ops->write = ct3d_reg_write; + + cxl_component_register_block_init(OBJECT(pci_dev), cxl_cstate, + TYPE_CXL_TYPE3); + + pci_register_bar( + pci_dev, CXL_COMPONENT_REG_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64, mr); + + cxl_device_register_block_init(OBJECT(pci_dev), &ct3d->cxl_dstate); + pci_register_bar(pci_dev, CXL_DEVICE_REG_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, + &ct3d->cxl_dstate.device_registers); +} + +static void ct3_exit(PCIDevice *pci_dev) +{ + CXLType3Dev *ct3d = CXL_TYPE3(pci_dev); + CXLComponentState *cxl_cstate = &ct3d->cxl_cstate; + ComponentRegisters *regs = &cxl_cstate->crb; + + g_free(regs->special_ops); + address_space_destroy(&ct3d->hostmem_as); +} + +/* TODO: Support multiple HDM decoders and DPA skip */ +static bool cxl_type3_dpa(CXLType3Dev *ct3d, hwaddr host_addr, uint64_t *dpa) +{ + uint32_t *cache_mem = ct3d->cxl_cstate.crb.cache_mem_registers; + uint64_t decoder_base, decoder_size, hpa_offset; + uint32_t hdm0_ctrl; + int ig, iw; + + decoder_base = (((uint64_t)cache_mem[R_CXL_HDM_DECODER0_BASE_HI] << 32) | + cache_mem[R_CXL_HDM_DECODER0_BASE_LO]); + if ((uint64_t)host_addr < decoder_base) { + return false; + } + + hpa_offset = (uint64_t)host_addr - decoder_base; + + decoder_size = ((uint64_t)cache_mem[R_CXL_HDM_DECODER0_SIZE_HI] << 32) | + cache_mem[R_CXL_HDM_DECODER0_SIZE_LO]; + if (hpa_offset >= decoder_size) { + return false; + } + + hdm0_ctrl = cache_mem[R_CXL_HDM_DECODER0_CTRL]; + iw = FIELD_EX32(hdm0_ctrl, CXL_HDM_DECODER0_CTRL, IW); + ig = FIELD_EX32(hdm0_ctrl, CXL_HDM_DECODER0_CTRL, IG); + + *dpa = (MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) | + 
((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset) >> iw); + + return true; +} + +MemTxResult cxl_type3_read(PCIDevice *d, hwaddr host_addr, uint64_t *data, + unsigned size, MemTxAttrs attrs) +{ + CXLType3Dev *ct3d = CXL_TYPE3(d); + uint64_t dpa_offset; + MemoryRegion *mr; + + /* TODO support volatile region */ + mr = host_memory_backend_get_memory(ct3d->hostmem); + if (!mr) { + return MEMTX_ERROR; + } + + if (!cxl_type3_dpa(ct3d, host_addr, &dpa_offset)) { + return MEMTX_ERROR; + } + + if (dpa_offset + size > int128_get64(mr->size)) { + return MEMTX_ERROR; + } + + return address_space_read(&ct3d->hostmem_as, dpa_offset, attrs, data, size); +} + +MemTxResult cxl_type3_write(PCIDevice *d, hwaddr host_addr, uint64_t data, + unsigned size, MemTxAttrs attrs) +{ + CXLType3Dev *ct3d = CXL_TYPE3(d); + uint64_t dpa_offset; + MemoryRegion *mr; + + mr = host_memory_backend_get_memory(ct3d->hostmem); + if (!mr) { + return MEMTX_OK; + } + + if (!cxl_type3_dpa(ct3d, host_addr, &dpa_offset)) { + return MEMTX_OK; + } + + if (dpa_offset + size > int128_get64(mr->size)) { + return MEMTX_OK; + } + return address_space_write(&ct3d->hostmem_as, dpa_offset, attrs, + &data, size); +} + +static void ct3d_reset(DeviceState *dev) +{ + CXLType3Dev *ct3d = CXL_TYPE3(dev); + uint32_t *reg_state = ct3d->cxl_cstate.crb.cache_mem_registers; + uint32_t *write_msk = ct3d->cxl_cstate.crb.cache_mem_regs_write_mask; + + cxl_component_register_init_common(reg_state, write_msk, CXL2_TYPE3_DEVICE); + cxl_device_register_init_common(&ct3d->cxl_dstate); +} + +static Property ct3_props[] = { + DEFINE_PROP_LINK("memdev", CXLType3Dev, hostmem, TYPE_MEMORY_BACKEND, + HostMemoryBackend *), + DEFINE_PROP_LINK("lsa", CXLType3Dev, lsa, TYPE_MEMORY_BACKEND, + HostMemoryBackend *), + DEFINE_PROP_END_OF_LIST(), +}; + +static uint64_t get_lsa_size(CXLType3Dev *ct3d) +{ + MemoryRegion *mr; + + mr = host_memory_backend_get_memory(ct3d->lsa); + return memory_region_size(mr); +} + +static void validate_lsa_access(MemoryRegion *mr, uint64_t size, + uint64_t offset) +{ + assert(offset + size <= memory_region_size(mr)); + assert(offset + size > offset); +} + +static uint64_t get_lsa(CXLType3Dev *ct3d, void *buf, uint64_t size, + uint64_t offset) +{ + MemoryRegion *mr; + void *lsa; + + mr = host_memory_backend_get_memory(ct3d->lsa); + validate_lsa_access(mr, size, offset); + + lsa = memory_region_get_ram_ptr(mr) + offset; + memcpy(buf, lsa, size); + + return size; +} + +static void set_lsa(CXLType3Dev *ct3d, const void *buf, uint64_t size, + uint64_t offset) +{ + MemoryRegion *mr; + void *lsa; + + mr = host_memory_backend_get_memory(ct3d->lsa); + validate_lsa_access(mr, size, offset); + + lsa = memory_region_get_ram_ptr(mr) + offset; + memcpy(lsa, buf, size); + memory_region_set_dirty(mr, offset, size); + + /* + * Just like the PMEM, if the guest is not allowed to exit gracefully, label + * updates will get lost.
+ */ +} + +static void ct3_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc); + CXLType3Class *cvc = CXL_TYPE3_CLASS(oc); + + pc->realize = ct3_realize; + pc->exit = ct3_exit; + pc->class_id = PCI_CLASS_STORAGE_EXPRESS; + pc->vendor_id = PCI_VENDOR_ID_INTEL; + pc->device_id = 0xd93; /* LVF for now */ + pc->revision = 1; + + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + dc->desc = "CXL PMEM Device (Type 3)"; + dc->reset = ct3d_reset; + device_class_set_props(dc, ct3_props); + + cvc->get_lsa_size = get_lsa_size; + cvc->get_lsa = get_lsa; + cvc->set_lsa = set_lsa; +} + +static const TypeInfo ct3d_info = { + .name = TYPE_CXL_TYPE3, + .parent = TYPE_PCI_DEVICE, + .class_size = sizeof(struct CXLType3Class), + .class_init = ct3_class_init, + .instance_size = sizeof(CXLType3Dev), + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CXL_DEVICE }, + { INTERFACE_PCIE_DEVICE }, + {} + }, +}; + +static void ct3d_registers(void) +{ + type_register_static(&ct3d_info); +} + +type_init(ct3d_registers); diff --git a/hw/mem/meson.build b/hw/mem/meson.build index 82f86d117e..609b2b36fc 100644 --- a/hw/mem/meson.build +++ b/hw/mem/meson.build @@ -3,6 +3,7 @@ mem_ss.add(files('memory-device.c')) mem_ss.add(when: 'CONFIG_DIMM', if_true: files('pc-dimm.c')) mem_ss.add(when: 'CONFIG_NPCM7XX', if_true: files('npcm7xx_mc.c')) mem_ss.add(when: 'CONFIG_NVDIMM', if_true: files('nvdimm.c')) +mem_ss.add(when: 'CONFIG_CXL_MEM_DEVICE', if_true: files('cxl_type3.c')) softmmu_ss.add_all(when: 'CONFIG_MEM_DEVICE', if_true: mem_ss) diff --git a/hw/meson.build b/hw/meson.build index b3366c888e..9992c5101e 100644 --- a/hw/meson.build +++ b/hw/meson.build @@ -6,6 +6,7 @@ subdir('block') subdir('char') subdir('core') subdir('cpu') +subdir('cxl') subdir('display') subdir('dma') subdir('gpio') diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 30379d2ca4..ccac5b7a64 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -201,7 +201,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); } if (~net->dev.features & net->dev.backend_features) { - fprintf(stderr, "vhost lacks feature mask %" PRIu64 + fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 " for backend\n", (uint64_t)(~net->dev.features & net->dev.backend_features)); goto fail; @@ -213,7 +213,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) if (net->nc->info->type == NET_CLIENT_DRIVER_VHOST_USER) { features = vhost_user_get_acked_features(net->nc); if (~net->dev.features & features) { - fprintf(stderr, "vhost lacks feature mask %" PRIu64 + fprintf(stderr, "vhost lacks feature mask 0x%" PRIx64 " for backend\n", (uint64_t)(~net->dev.features & features)); goto fail; @@ -381,6 +381,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, r = vhost_set_vring_enable(peer, peer->vring_enable); if (r < 0) { + vhost_net_stop_one(get_vhost_net(peer), dev); goto err_start; } } @@ -390,7 +391,8 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, err_start: while (--i >= 0) { - peer = qemu_get_peer(ncs , i); + peer = qemu_get_peer(ncs, i < data_queue_pairs ? 
+ i : n->max_queue_pairs); vhost_net_stop_one(get_vhost_net(peer), dev); } e = k->set_guest_notifiers(qbus->parent, total_notifiers, false); diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 1067e72b39..7ad948ee7c 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "qemu/atomic.h" #include "qemu/iov.h" +#include "qemu/log.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "hw/virtio/virtio.h" @@ -245,7 +246,8 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) VirtIODevice *vdev = VIRTIO_DEVICE(n); NetClientState *nc = qemu_get_queue(n->nic); int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1; - int cvq = n->max_ncs - n->max_queue_pairs; + int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ? + n->max_ncs - n->max_queue_pairs : 0; if (!get_vhost_net(nc->peer)) { return; @@ -1379,6 +1381,7 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, { VirtIODevice *vdev = VIRTIO_DEVICE(n); uint16_t queue_pairs; + NetClientState *nc = qemu_get_queue(n->nic); virtio_net_disable_rss(n); if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) { @@ -1410,6 +1413,18 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, return VIRTIO_NET_ERR; } + /* Avoid changing the number of queue_pairs for vdpa device in + * userspace handler. A future fix is needed to handle the mq + * change in userspace handler with vhost-vdpa. Let's disable + * the mq handling from userspace for now and only allow get + * done through the kernel. Ripples may be seen when falling + * back to userspace, but without doing it qemu process would + * crash on a recursive entry to virtio_net_set_status(). + */ + if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { + return VIRTIO_NET_ERR; + } + n->curr_queue_pairs = queue_pairs; /* stop the backend before changing the number of queue_pairs to avoid handling a * disabled queue */ @@ -1443,7 +1458,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) } iov_cnt = elem->out_num; - iov2 = iov = g_memdup(elem->out_sg, sizeof(struct iovec) * elem->out_num); + iov2 = iov = g_memdup2(elem->out_sg, + sizeof(struct iovec) * elem->out_num); s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl)); iov_discard_front(&iov, &iov_cnt, sizeof(ctrl)); if (s != sizeof(ctrl)) { @@ -3170,8 +3186,22 @@ static NetClientInfo net_virtio_info = { static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx) { VirtIONet *n = VIRTIO_NET(vdev); - NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); + NetClientState *nc; assert(n->vhost_started); + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) { + /* Must guard against invalid features and bogus queue index + * from being set by malicious guest, or penetrated through + * buggy migration stream. 
+ */ + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: bogus vq index ignored\n", __func__); + return false; + } + nc = qemu_get_subqueue(n->nic, n->max_queue_pairs); + } else { + nc = qemu_get_subqueue(n->nic, vq2q(idx)); + } return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx); } @@ -3179,8 +3209,22 @@ static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask) { VirtIONet *n = VIRTIO_NET(vdev); - NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); + NetClientState *nc; assert(n->vhost_started); + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) { + /* Must guard against invalid features and bogus queue index + * from being set by malicious guest, or penetrated through + * buggy migration stream. + */ + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: bogus vq index ignored\n", __func__); + return; + } + nc = qemu_get_subqueue(n->nic, n->max_queue_pairs); + } else { + nc = qemu_get_subqueue(n->nic, vq2q(idx)); + } vhost_net_virtqueue_mask(get_vhost_net(nc->peer), vdev, idx, mask); } @@ -3392,7 +3436,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) } virtio_net_set_config_size(n, n->host_features); - virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size); + virtio_init(vdev, VIRTIO_ID_NET, n->config_size); /* * We set a lower limit on RX queue size to what it always was. @@ -3619,6 +3663,14 @@ static bool dev_unplug_pending(void *opaque) return vdc->primary_unplug_pending(dev); } +static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev) +{ + VirtIONet *n = VIRTIO_NET(vdev); + NetClientState *nc = qemu_get_queue(n->nic); + struct vhost_net *net = get_vhost_net(nc->peer); + return &net->dev; +} + static const VMStateDescription vmstate_virtio_net = { .name = "virtio-net", .minimum_version_id = VIRTIO_NET_VM_VERSION, @@ -3721,6 +3773,7 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->post_load = virtio_net_post_load_virtio; vdc->vmsd = &vmstate_virtio_net_device; vdc->primary_unplug_pending = primary_unplug_pending; + vdc->get_vhost = virtio_net_get_vhost; } static const TypeInfo virtio_net_info = { diff --git a/hw/pci-bridge/Kconfig b/hw/pci-bridge/Kconfig index f8df4315ba..02614f49aa 100644 --- a/hw/pci-bridge/Kconfig +++ b/hw/pci-bridge/Kconfig @@ -27,3 +27,8 @@ config DEC_PCI config SIMBA bool + +config CXL + bool + default y if PCI_EXPRESS && PXB + depends on PCI_EXPRESS && MSI_NONBROKEN && PXB diff --git a/hw/pci-bridge/cxl_root_port.c b/hw/pci-bridge/cxl_root_port.c new file mode 100644 index 0000000000..fb213fa06e --- /dev/null +++ b/hw/pci-bridge/cxl_root_port.c @@ -0,0 +1,236 @@ +/* + * CXL 2.0 Root Port Implementation + * + * Copyright(C) 2020 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qemu/range.h" +#include "hw/pci/pci_bridge.h" +#include "hw/pci/pcie_port.h" +#include "hw/qdev-properties.h" +#include "hw/sysbus.h" +#include "qapi/error.h" +#include "hw/cxl/cxl.h" + +#define CXL_ROOT_PORT_DID 0x7075 + +/* Copied from the gen root port which we derive */ +#define GEN_PCIE_ROOT_PORT_AER_OFFSET 0x100 +#define GEN_PCIE_ROOT_PORT_ACS_OFFSET \ + (GEN_PCIE_ROOT_PORT_AER_OFFSET + PCI_ERR_SIZEOF) +#define CXL_ROOT_PORT_DVSEC_OFFSET \ + (GEN_PCIE_ROOT_PORT_ACS_OFFSET + PCI_ACS_SIZEOF) + +typedef struct CXLRootPort { + /*< private >*/ + PCIESlot parent_obj; + + CXLComponentState cxl_cstate; + PCIResReserve res_reserve; +} CXLRootPort; + +#define TYPE_CXL_ROOT_PORT "cxl-rp" +DECLARE_INSTANCE_CHECKER(CXLRootPort, CXL_ROOT_PORT, TYPE_CXL_ROOT_PORT) + +static void latch_registers(CXLRootPort *crp) +{ + uint32_t *reg_state = crp->cxl_cstate.crb.cache_mem_registers; + uint32_t *write_msk = crp->cxl_cstate.crb.cache_mem_regs_write_mask; + + cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT); +} + +static void build_dvsecs(CXLComponentState *cxl) +{ + uint8_t *dvsec; + + dvsec = (uint8_t *)&(CXLDVSECPortExtensions){ 0 }; + cxl_component_create_dvsec(cxl, CXL2_ROOT_PORT, + EXTENSIONS_PORT_DVSEC_LENGTH, + EXTENSIONS_PORT_DVSEC, + EXTENSIONS_PORT_DVSEC_REVID, dvsec); + + dvsec = (uint8_t *)&(CXLDVSECPortGPF){ + .rsvd = 0, + .phase1_ctrl = 1, /* 1μs timeout */ + .phase2_ctrl = 1, /* 1μs timeout */ + }; + cxl_component_create_dvsec(cxl, CXL2_ROOT_PORT, + GPF_PORT_DVSEC_LENGTH, GPF_PORT_DVSEC, + GPF_PORT_DVSEC_REVID, dvsec); + + dvsec = (uint8_t *)&(CXLDVSECPortFlexBus){ + .cap = 0x26, /* IO, Mem, non-MLD */ + .ctrl = 0x2, + .status = 0x26, /* same */ + .rcvd_mod_ts_data_phase1 = 0xef, + }; + cxl_component_create_dvsec(cxl, CXL2_ROOT_PORT, + PCIE_FLEXBUS_PORT_DVSEC_LENGTH_2_0, + PCIE_FLEXBUS_PORT_DVSEC, + PCIE_FLEXBUS_PORT_DVSEC_REVID_2_0, dvsec); + + dvsec = (uint8_t *)&(CXLDVSECRegisterLocator){ + .rsvd = 0, + .reg0_base_lo = RBI_COMPONENT_REG | CXL_COMPONENT_REG_BAR_IDX, + .reg0_base_hi = 0, + }; + cxl_component_create_dvsec(cxl, CXL2_ROOT_PORT, + REG_LOC_DVSEC_LENGTH, REG_LOC_DVSEC, + REG_LOC_DVSEC_REVID, dvsec); +} + +static void cxl_rp_realize(DeviceState *dev, Error **errp) +{ + PCIDevice *pci_dev = PCI_DEVICE(dev); + PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(dev); + CXLRootPort *crp = CXL_ROOT_PORT(dev); + CXLComponentState *cxl_cstate = &crp->cxl_cstate; + ComponentRegisters *cregs = &cxl_cstate->crb; + MemoryRegion *component_bar = &cregs->component_registers; + Error *local_err = NULL; + + rpc->parent_realize(dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + int rc = + pci_bridge_qemu_reserve_cap_init(pci_dev, 0, crp->res_reserve, errp); + if (rc < 0) { + rpc->parent_class.exit(pci_dev); + return; + } + + if (!crp->res_reserve.io || crp->res_reserve.io == -1) { + pci_word_test_and_clear_mask(pci_dev->wmask + PCI_COMMAND, + PCI_COMMAND_IO); + pci_dev->wmask[PCI_IO_BASE] = 0; + pci_dev->wmask[PCI_IO_LIMIT] = 0; + } + + cxl_cstate->dvsec_offset = CXL_ROOT_PORT_DVSEC_OFFSET; + cxl_cstate->pdev = pci_dev; + build_dvsecs(&crp->cxl_cstate); + + cxl_component_register_block_init(OBJECT(pci_dev), cxl_cstate, + TYPE_CXL_ROOT_PORT); + + pci_register_bar(pci_dev, CXL_COMPONENT_REG_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY | + 
PCI_BASE_ADDRESS_MEM_TYPE_64, + component_bar); +} + +static void cxl_rp_reset(DeviceState *dev) +{ + PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(dev); + CXLRootPort *crp = CXL_ROOT_PORT(dev); + + rpc->parent_reset(dev); + + latch_registers(crp); +} + +static Property gen_rp_props[] = { + DEFINE_PROP_UINT32("bus-reserve", CXLRootPort, res_reserve.bus, -1), + DEFINE_PROP_SIZE("io-reserve", CXLRootPort, res_reserve.io, -1), + DEFINE_PROP_SIZE("mem-reserve", CXLRootPort, res_reserve.mem_non_pref, -1), + DEFINE_PROP_SIZE("pref32-reserve", CXLRootPort, res_reserve.mem_pref_32, + -1), + DEFINE_PROP_SIZE("pref64-reserve", CXLRootPort, res_reserve.mem_pref_64, + -1), + DEFINE_PROP_END_OF_LIST() +}; + +static void cxl_rp_dvsec_write_config(PCIDevice *dev, uint32_t addr, + uint32_t val, int len) +{ + CXLRootPort *crp = CXL_ROOT_PORT(dev); + + if (range_contains(&crp->cxl_cstate.dvsecs[EXTENSIONS_PORT_DVSEC], addr)) { + uint8_t *reg = &dev->config[addr]; + addr -= crp->cxl_cstate.dvsecs[EXTENSIONS_PORT_DVSEC].lob; + if (addr == PORT_CONTROL_OFFSET) { + if (pci_get_word(reg) & PORT_CONTROL_UNMASK_SBR) { + /* unmask SBR */ + qemu_log_mask(LOG_UNIMP, "SBR mask control is not supported\n"); + } + if (pci_get_word(reg) & PORT_CONTROL_ALT_MEMID_EN) { + /* Alt Memory & ID Space Enable */ + qemu_log_mask(LOG_UNIMP, + "Alt Memory & ID space is not supported\n"); + } + } + } +} + +static void cxl_rp_write_config(PCIDevice *d, uint32_t address, uint32_t val, + int len) +{ + uint16_t slt_ctl, slt_sta; + + pcie_cap_slot_get(d, &slt_ctl, &slt_sta); + pci_bridge_write_config(d, address, val, len); + pcie_cap_flr_write_config(d, address, val, len); + pcie_cap_slot_write_config(d, slt_ctl, slt_sta, address, val, len); + pcie_aer_write_config(d, address, val, len); + + cxl_rp_dvsec_write_config(d, address, val, len); +} + +static void cxl_root_port_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + PCIDeviceClass *k = PCI_DEVICE_CLASS(oc); + PCIERootPortClass *rpc = PCIE_ROOT_PORT_CLASS(oc); + + k->vendor_id = PCI_VENDOR_ID_INTEL; + k->device_id = CXL_ROOT_PORT_DID; + dc->desc = "CXL Root Port"; + k->revision = 0; + device_class_set_props(dc, gen_rp_props); + k->config_write = cxl_rp_write_config; + + device_class_set_parent_realize(dc, cxl_rp_realize, &rpc->parent_realize); + device_class_set_parent_reset(dc, cxl_rp_reset, &rpc->parent_reset); + + rpc->aer_offset = GEN_PCIE_ROOT_PORT_AER_OFFSET; + rpc->acs_offset = GEN_PCIE_ROOT_PORT_ACS_OFFSET; + + dc->hotpluggable = false; +} + +static const TypeInfo cxl_root_port_info = { + .name = TYPE_CXL_ROOT_PORT, + .parent = TYPE_PCIE_ROOT_PORT, + .instance_size = sizeof(CXLRootPort), + .class_init = cxl_root_port_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CXL_DEVICE }, + { } + }, +}; + +static void cxl_register(void) +{ + type_register_static(&cxl_root_port_info); +} + +type_init(cxl_register); diff --git a/hw/pci-bridge/meson.build b/hw/pci-bridge/meson.build index daab8acf2a..b6d26a03d5 100644 --- a/hw/pci-bridge/meson.build +++ b/hw/pci-bridge/meson.build @@ -5,6 +5,7 @@ pci_ss.add(when: 'CONFIG_IOH3420', if_true: files('ioh3420.c')) pci_ss.add(when: 'CONFIG_PCIE_PORT', if_true: files('pcie_root_port.c', 'gen_pcie_root_port.c', 'pcie_pci_bridge.c')) pci_ss.add(when: 'CONFIG_PXB', if_true: files('pci_expander_bridge.c')) pci_ss.add(when: 'CONFIG_XIO3130', if_true: files('xio3130_upstream.c', 'xio3130_downstream.c')) +pci_ss.add(when: 'CONFIG_CXL', if_true: files('cxl_root_port.c')) # NewWorld PowerMac 
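
With this root port, the cxl-type3 device and the pxb-cxl expander bridge added elsewhere in the series, a complete CXL topology can be assembled on the command line. A sketch of a plausible invocation (device and property names are the ones defined in this series; the memory backends and the cxl-fmw fixed-memory-window arguments are illustrative)::

    $ qemu-system-x86_64 -M q35,cxl=on \
        -object memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxl.raw,size=256M \
        -object memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa.raw,size=256M \
        -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
        -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
        -device cxl-type3,bus=root_port13,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \
        -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G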
pci_ss.add(when: 'CONFIG_DEC_PCI', if_true: files('dec.c')) diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index de932286b5..69244decdb 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -17,6 +17,7 @@ #include "hw/pci/pci_host.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" +#include "hw/cxl/cxl.h" #include "qemu/range.h" #include "qemu/error-report.h" #include "qemu/module.h" @@ -24,6 +25,8 @@ #include "hw/boards.h" #include "qom/object.h" +enum BusType { PCI, PCIE, CXL }; + #define TYPE_PXB_BUS "pxb-bus" typedef struct PXBBus PXBBus; DECLARE_INSTANCE_CHECKER(PXBBus, PXB_BUS, @@ -33,6 +36,10 @@ DECLARE_INSTANCE_CHECKER(PXBBus, PXB_BUS, DECLARE_INSTANCE_CHECKER(PXBBus, PXB_PCIE_BUS, TYPE_PXB_PCIE_BUS) +#define TYPE_PXB_CXL_BUS "pxb-cxl-bus" +DECLARE_INSTANCE_CHECKER(PXBBus, PXB_CXL_BUS, + TYPE_PXB_CXL_BUS) + struct PXBBus { /*< private >*/ PCIBus parent_obj; @@ -50,18 +57,13 @@ DECLARE_INSTANCE_CHECKER(PXBDev, PXB_DEV, DECLARE_INSTANCE_CHECKER(PXBDev, PXB_PCIE_DEV, TYPE_PXB_PCIE_DEVICE) -struct PXBDev { - /*< private >*/ - PCIDevice parent_obj; - /*< public >*/ - - uint8_t bus_nr; - uint16_t numa_node; - bool bypass_iommu; -}; - static PXBDev *convert_to_pxb(PCIDevice *dev) { + /* A CXL PXB's parent bus is PCIe, so the normal check won't work */ + if (object_dynamic_cast(OBJECT(dev), TYPE_PXB_CXL_DEVICE)) { + return PXB_CXL_DEV(dev); + } + return pci_bus_is_express(pci_get_bus(dev)) ? PXB_PCIE_DEV(dev) : PXB_DEV(dev); } @@ -70,6 +72,13 @@ static GList *pxb_dev_list; #define TYPE_PXB_HOST "pxb-host" +CXLComponentState *cxl_get_hb_cstate(PCIHostState *hb) +{ + CXLHost *host = PXB_CXL_HOST(hb); + + return &host->cxl_cstate; +} + static int pxb_bus_num(PCIBus *bus) { PXBDev *pxb = convert_to_pxb(bus->parent_dev); @@ -106,11 +115,20 @@ static const TypeInfo pxb_pcie_bus_info = { .class_init = pxb_bus_class_init, }; +static const TypeInfo pxb_cxl_bus_info = { + .name = TYPE_PXB_CXL_BUS, + .parent = TYPE_CXL_BUS, + .instance_size = sizeof(PXBBus), + .class_init = pxb_bus_class_init, +}; + static const char *pxb_host_root_bus_path(PCIHostState *host_bridge, PCIBus *rootbus) { - PXBBus *bus = pci_bus_is_express(rootbus) ? - PXB_PCIE_BUS(rootbus) : PXB_BUS(rootbus); + PXBBus *bus = pci_bus_is_cxl(rootbus) ? + PXB_CXL_BUS(rootbus) : + pci_bus_is_express(rootbus) ? 
PXB_PCIE_BUS(rootbus) : + PXB_BUS(rootbus); snprintf(bus->bus_path, 8, "0000:%02x", pxb_bus_num(rootbus)); return bus->bus_path; @@ -166,6 +184,52 @@ static const TypeInfo pxb_host_info = { .class_init = pxb_host_class_init, }; +static void pxb_cxl_realize(DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + CXLHost *cxl = PXB_CXL_HOST(dev); + CXLComponentState *cxl_cstate = &cxl->cxl_cstate; + struct MemoryRegion *mr = &cxl_cstate->crb.component_registers; + hwaddr offset; + + cxl_component_register_block_init(OBJECT(dev), cxl_cstate, + TYPE_PXB_CXL_HOST); + sysbus_init_mmio(sbd, mr); + + offset = memory_region_size(mr) * ms->cxl_devices_state->next_mr_idx; + if (offset > memory_region_size(&ms->cxl_devices_state->host_mr)) { + error_setg(errp, "Insufficient space for pxb cxl host register space"); + return; + } + + memory_region_add_subregion(&ms->cxl_devices_state->host_mr, offset, mr); + ms->cxl_devices_state->next_mr_idx++; +} + +static void pxb_cxl_host_class_init(ObjectClass *class, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(class); + PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(class); + + hc->root_bus_path = pxb_host_root_bus_path; + dc->fw_name = "cxl"; + dc->realize = pxb_cxl_realize; + /* Reason: Internal part of the pxb/pxb-pcie device, not usable by itself */ + dc->user_creatable = false; +} + +/* + * This is a device to handle the MMIO for a CXL host bridge. It does nothing + * else. + */ +static const TypeInfo cxl_host_info = { + .name = TYPE_PXB_CXL_HOST, + .parent = TYPE_PCI_HOST_BRIDGE, + .instance_size = sizeof(CXLHost), + .class_init = pxb_cxl_host_class_init, +}; + /* * Registers the PXB bus as a child of pci host root bus. */ @@ -212,6 +276,17 @@ static int pxb_map_irq_fn(PCIDevice *pci_dev, int pin) return pin - PCI_SLOT(pxb->devfn); } +static void pxb_dev_reset(DeviceState *dev) +{ + CXLHost *cxl = PXB_CXL_DEV(dev)->cxl.cxl_host_bridge; + CXLComponentState *cxl_cstate = &cxl->cxl_cstate; + uint32_t *reg_state = cxl_cstate->crb.cache_mem_registers; + uint32_t *write_msk = cxl_cstate->crb.cache_mem_regs_write_mask; + + cxl_component_register_init_common(reg_state, write_msk, CXL2_ROOT_PORT); + ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT, 8); +} + static gint pxb_compare(gconstpointer a, gconstpointer b) { const PXBDev *pxb_a = a, *pxb_b = b; @@ -221,7 +296,8 @@ static gint pxb_compare(gconstpointer a, gconstpointer b) 0; } -static void pxb_dev_realize_common(PCIDevice *dev, bool pcie, Error **errp) +static void pxb_dev_realize_common(PCIDevice *dev, enum BusType type, + Error **errp) { PXBDev *pxb = convert_to_pxb(dev); DeviceState *ds, *bds = NULL; @@ -245,9 +321,13 @@ static void pxb_dev_realize_common(PCIDevice *dev, bool pcie, Error **errp) dev_name = dev->qdev.id; } - ds = qdev_new(TYPE_PXB_HOST); - if (pcie) { + ds = qdev_new(type == CXL ? 
TYPE_PXB_CXL_HOST : TYPE_PXB_HOST); + if (type == PCIE) { bus = pci_root_bus_new(ds, dev_name, NULL, NULL, 0, TYPE_PXB_PCIE_BUS); + } else if (type == CXL) { + bus = pci_root_bus_new(ds, dev_name, NULL, NULL, 0, TYPE_PXB_CXL_BUS); + bus->flags |= PCI_BUS_CXL; + PXB_CXL_DEV(dev)->cxl.cxl_host_bridge = PXB_CXL_HOST(ds); } else { bus = pci_root_bus_new(ds, "pxb-internal", NULL, NULL, 0, TYPE_PXB_BUS); bds = qdev_new("pci-bridge"); @@ -295,7 +375,7 @@ static void pxb_dev_realize(PCIDevice *dev, Error **errp) return; } - pxb_dev_realize_common(dev, false, errp); + pxb_dev_realize_common(dev, PCI, errp); } static void pxb_dev_exitfn(PCIDevice *pci_dev) @@ -348,7 +428,7 @@ static void pxb_pcie_dev_realize(PCIDevice *dev, Error **errp) return; } - pxb_dev_realize_common(dev, true, errp); + pxb_dev_realize_common(dev, PCIE, errp); } static void pxb_pcie_dev_class_init(ObjectClass *klass, void *data) @@ -379,13 +459,67 @@ static const TypeInfo pxb_pcie_dev_info = { }, }; +static void pxb_cxl_dev_realize(PCIDevice *dev, Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + + /* A CXL PXB's parent bus is still PCIe */ + if (!pci_bus_is_express(pci_get_bus(dev))) { + error_setg(errp, "pxb-cxl devices cannot reside on a PCI bus"); + return; + } + if (!ms->cxl_devices_state->is_enabled) { + error_setg(errp, "Machine does not have cxl=on"); + return; + } + + pxb_dev_realize_common(dev, CXL, errp); + pxb_dev_reset(DEVICE(dev)); +} + +static void pxb_cxl_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = pxb_cxl_dev_realize; + k->exit = pxb_dev_exitfn; + /* + * XXX: These types of bridges don't actually show up in the hierarchy so + * vendor, device, class, etc. ids are intentionally left out. + */ + + dc->desc = "CXL Host Bridge"; + device_class_set_props(dc, pxb_dev_properties); + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + + /* Host bridges aren't hotpluggable. 
FIXME: spec reference */ + dc->hotpluggable = false; + dc->reset = pxb_dev_reset; +} + +static const TypeInfo pxb_cxl_dev_info = { + .name = TYPE_PXB_CXL_DEVICE, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PXBDev), + .class_init = pxb_cxl_dev_class_init, + .interfaces = + (InterfaceInfo[]){ + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + {}, + }, +}; + static void pxb_register_types(void) { type_register_static(&pxb_bus_info); type_register_static(&pxb_pcie_bus_info); + type_register_static(&pxb_cxl_bus_info); type_register_static(&pxb_host_info); + type_register_static(&cxl_host_info); type_register_static(&pxb_dev_info); type_register_static(&pxb_pcie_dev_info); + type_register_static(&pxb_cxl_dev_info); } type_init(pxb_register_types) diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c index f1cfe9d14a..460e48269d 100644 --- a/hw/pci-bridge/pcie_root_port.c +++ b/hw/pci-bridge/pcie_root_port.c @@ -67,7 +67,11 @@ static void rp_realize(PCIDevice *d, Error **errp) int rc; pci_config_set_interrupt_pin(d->config, 1); - pci_bridge_initfn(d, TYPE_PCIE_BUS); + if (d->cap_present & QEMU_PCIE_CAP_CXL) { + pci_bridge_initfn(d, TYPE_CXL_BUS); + } else { + pci_bridge_initfn(d, TYPE_PCIE_BUS); + } pcie_port_init_reg(d); rc = pci_bridge_ssvid_init(d, rpc->ssvid_offset, dc->vendor_id, diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index e7e162a00a..7c7316bc96 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -5,6 +5,7 @@ #include "hw/pci/pci_bus.h" #include "hw/pci/pci_bridge.h" #include "hw/pci/pcie_host.h" +#include "hw/acpi/cxl.h" static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) { @@ -139,6 +140,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) QLIST_FOREACH(bus, &bus->child, sibling) { uint8_t bus_num = pci_bus_num(bus); uint8_t numa_node = pci_bus_numa_node(bus); + bool is_cxl = pci_bus_is_cxl(bus); if (!pci_bus_is_root(bus)) { continue; @@ -154,8 +156,16 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) } dev = aml_device("PC%.02X", bus_num); - aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A08"))); - aml_append(dev, aml_name_decl("_CID", aml_string("PNP0A03"))); + if (is_cxl) { + struct Aml *pkg = aml_package(2); + aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0016"))); + aml_append(pkg, aml_eisaid("PNP0A08")); + aml_append(pkg, aml_eisaid("PNP0A03")); + aml_append(dev, aml_name_decl("_CID", pkg)); + } else { + aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A08"))); + aml_append(dev, aml_name_decl("_CID", aml_string("PNP0A03"))); + } aml_append(dev, aml_name_decl("_BBN", aml_int(bus_num))); aml_append(dev, aml_name_decl("_UID", aml_int(bus_num))); aml_append(dev, aml_name_decl("_STR", aml_unicode("pxb Device"))); @@ -175,7 +185,11 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) cfg->pio.base, 0, 0, 0); aml_append(dev, aml_name_decl("_CRS", crs)); - acpi_dsdt_add_pci_osc(dev); + if (is_cxl) { + build_cxl_osc_method(dev); + } else { + acpi_dsdt_add_pci_osc(dev); + } aml_append(scope, dev); } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 9c58f02853..a9b37f8000 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -200,6 +200,11 @@ static const TypeInfo pci_bus_info = { .class_init = pci_bus_class_init, }; +static const TypeInfo cxl_interface_info = { + .name = INTERFACE_CXL_DEVICE, + .parent = TYPE_INTERFACE, +}; + static const TypeInfo pcie_interface_info = { .name = INTERFACE_PCIE_DEVICE, .parent = TYPE_INTERFACE, @@ -223,6 +228,12 @@ static const 
TypeInfo pcie_bus_info = { .class_init = pcie_bus_class_init, }; +static const TypeInfo cxl_bus_info = { + .name = TYPE_CXL_BUS, + .parent = TYPE_PCIE_BUS, + .class_init = pcie_bus_class_init, +}; + static PCIBus *pci_find_bus_nr(PCIBus *bus, int bus_num); static void pci_update_mappings(PCIDevice *d); static void pci_irq_handler(void *opaque, int irq_num, int level); @@ -2182,6 +2193,10 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } + if (object_class_dynamic_cast(klass, INTERFACE_CXL_DEVICE)) { + pci_dev->cap_present |= QEMU_PCIE_CAP_CXL; + } + pci_dev = do_pci_register_device(pci_dev, object_get_typename(OBJECT(qdev)), pci_dev->devfn, errp); @@ -2747,7 +2762,9 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data) object_class_dynamic_cast(klass, INTERFACE_CONVENTIONAL_PCI_DEVICE); ObjectClass *pcie = object_class_dynamic_cast(klass, INTERFACE_PCIE_DEVICE); - assert(conventional || pcie); + ObjectClass *cxl = + object_class_dynamic_cast(klass, INTERFACE_CXL_DEVICE); + assert(conventional || pcie || cxl); } } @@ -2937,7 +2954,9 @@ static void pci_register_types(void) { type_register_static(&pci_bus_info); type_register_static(&pcie_bus_info); + type_register_static(&cxl_bus_info); type_register_static(&conventional_pci_interface_info); + type_register_static(&cxl_interface_info); type_register_static(&pcie_interface_info); type_register_static(&pci_device_type_info); } diff --git a/hw/pci/pcie_port.c b/hw/pci/pcie_port.c index e95c1e5519..687e4e763a 100644 --- a/hw/pci/pcie_port.c +++ b/hw/pci/pcie_port.c @@ -136,6 +136,31 @@ static void pcie_port_class_init(ObjectClass *oc, void *data) device_class_set_props(dc, pcie_port_props); } +PCIDevice *pcie_find_port_by_pn(PCIBus *bus, uint8_t pn) +{ + int devfn; + + for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { + PCIDevice *d = bus->devices[devfn]; + PCIEPort *port; + + if (!d || !pci_is_express(d) || !d->exp.exp_cap) { + continue; + } + + if (!object_dynamic_cast(OBJECT(d), TYPE_PCIE_PORT)) { + continue; + } + + port = PCIE_PORT(d); + if (port->port == pn) { + return d; + } + } + + return NULL; +} + static const TypeInfo pcie_port_type_info = { .name = TYPE_PCIE_PORT, .parent = TYPE_PCI_BRIDGE, diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 778f43e4c1..3059068175 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -273,6 +273,13 @@ static void vhost_scsi_unrealize(DeviceState *dev) virtio_scsi_common_unrealize(dev); } +static struct vhost_dev *vhost_scsi_get_vhost(VirtIODevice *vdev) +{ + VHostSCSI *s = VHOST_SCSI(vdev); + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); + return &vsc->dev; +} + static Property vhost_scsi_properties[] = { DEFINE_PROP_STRING("vhostfd", VirtIOSCSICommon, conf.vhostfd), DEFINE_PROP_STRING("wwpn", VirtIOSCSICommon, conf.wwpn), @@ -307,6 +314,7 @@ static void vhost_scsi_class_init(ObjectClass *klass, void *data) vdc->get_features = vhost_scsi_common_get_features; vdc->set_config = vhost_scsi_common_set_config; vdc->set_status = vhost_scsi_set_status; + vdc->get_vhost = vhost_scsi_get_vhost; fwc->get_dev_path = vhost_scsi_common_get_fw_dev_path; } diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index 1b2f7eed98..9be21d07ee 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -121,6 +121,7 @@ static void vhost_user_scsi_realize(DeviceState *dev, Error **errp) vsc->dev.backend_features = 0; vqs = vsc->dev.vqs; + s->vhost_user.supports_config = true; ret = 
vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0, errp); if (ret < 0) { diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index db54d104be..4141dddd51 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -1013,8 +1013,7 @@ void virtio_scsi_common_realize(DeviceState *dev, VirtIOSCSICommon *s = VIRTIO_SCSI_COMMON(dev); int i; - virtio_init(vdev, "virtio-scsi", VIRTIO_ID_SCSI, - sizeof(VirtIOSCSIConfig)); + virtio_init(vdev, VIRTIO_ID_SCSI, sizeof(VirtIOSCSIConfig)); if (s->conf.num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { s->conf.num_queues = 1; diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 333348d9d5..ab8e095b73 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -21,6 +21,9 @@ vhost_user_set_mem_table_withfd(int index, const char *name, uint64_t memory_siz vhost_user_postcopy_waker(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64 vhost_user_postcopy_waker_found(uint64_t client_addr) "0x%"PRIx64 vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64 +vhost_user_read(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" +vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" +vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p" # vhost-vdpa.c vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8 @@ -89,7 +92,12 @@ virtio_mmio_guest_page(uint64_t size, int shift) "guest page size 0x%" PRIx64 " virtio_mmio_queue_write(uint64_t value, int max_size) "mmio_queue write 0x%" PRIx64 " max %d" virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d" -# virtio-iommu.c +# virtio-pci.c +virtio_pci_notify(uint16_t vector) "virtio_pci_notify vec 0x%x" +virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)" +virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)" + +# hw/virtio/virtio-iommu.c virtio_iommu_device_reset(void) "reset!" virtio_iommu_system_reset(void) "system reset!" 
virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 diff --git a/hw/virtio/vhost-scsi-pci.c b/hw/virtio/vhost-scsi-pci.c index cb71a294fa..08980bc23b 100644 --- a/hw/virtio/vhost-scsi-pci.c +++ b/hw/virtio/vhost-scsi-pci.c @@ -21,7 +21,7 @@ #include "hw/virtio/vhost-scsi.h" #include "qapi/error.h" #include "qemu/module.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" typedef struct VHostSCSIPCI VHostSCSIPCI; diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c index 1e5cfe2af6..56c96ebd13 100644 --- a/hw/virtio/vhost-shadow-virtqueue.c +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -138,6 +138,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, for (n = 0; n < num; n++) { if (more_descs || (n + 1 < num)) { descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); + descs[i].next = cpu_to_le16(svq->desc_next[i]); } else { descs[i].flags = flags; } @@ -145,10 +146,10 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, descs[i].len = cpu_to_le32(iovec[n].iov_len); last = i; - i = cpu_to_le16(descs[i].next); + i = cpu_to_le16(svq->desc_next[i]); } - svq->free_head = le16_to_cpu(descs[last].next); + svq->free_head = le16_to_cpu(svq->desc_next[last]); } static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, @@ -198,11 +199,19 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, return true; } +/** + * Add an element to a SVQ. + * + * The caller must check that there are enough slots for the new element. It + * takes ownership of the element: In case of failure, it is freed and the SVQ + * is considered broken. + */ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) { unsigned qemu_head; bool ok = vhost_svq_add_split(svq, elem, &qemu_head); if (unlikely(!ok)) { + g_free(elem); return false; } @@ -333,13 +342,22 @@ static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); } +static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq, + uint16_t num, uint16_t i) +{ + for (uint16_t j = 0; j < (num - 1); ++j) { + i = le16_to_cpu(svq->desc_next[i]); + } + + return i; +} + static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, uint32_t *len) { - vring_desc_t *descs = svq->vring.desc; const vring_used_t *used = svq->vring.used; vring_used_elem_t used_elem; - uint16_t last_used; + uint16_t last_used, last_used_chain, num; if (!vhost_svq_more_used(svq)) { return NULL; } @@ -365,7 +383,10 @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, return NULL; } - descs[used_elem.id].next = svq->free_head; + num = svq->ring_id_maps[used_elem.id]->in_num + + svq->ring_id_maps[used_elem.id]->out_num; + last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); + svq->desc_next[last_used_chain] = svq->free_head; svq->free_head = used_elem.id; *len = used_elem.len; @@ -540,8 +561,9 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size); memset(svq->vring.used, 0, device_size); svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); + svq->desc_next = g_new0(uint16_t, svq->vring.num); for (unsigned i = 0; i < svq->vring.num - 1; i++) { - svq->vring.desc[i].next = cpu_to_le16(i + 1); + svq->desc_next[i] = cpu_to_le16(i + 1); } } @@ -574,6 +596,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) 
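
The desc_next[] shadow introduced above exists because the used ring only returns the id of a chain's head descriptor: to push a whole chain back onto the free list, the SVQ walks next pointers it kept in QEMU-private memory rather than trusting the device-visible descriptor table. A minimal sketch of that walk (model_chain_tail is hypothetical; endianness conversions omitted):

.. code:: c

    #include <stdint.h>

    /* Return the tail of a chain of chain_len descriptors starting at head. */
    static uint16_t model_chain_tail(const uint16_t *desc_next,
                                     uint16_t head, uint16_t chain_len)
    {
        uint16_t i = head;

        for (uint16_t j = 0; j + 1 < chain_len; j++) {
            i = desc_next[i];
        }
        return i;
    }

vhost_svq_get_buf() then links desc_next[tail] to the previous free_head and makes the returned head the new free_head, the usual singly linked free-list push.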
virtqueue_detach_element(svq->vq, next_avail_elem, 0); } svq->vq = NULL; + g_free(svq->desc_next); g_free(svq->ring_id_maps); qemu_vfree(svq->vring.desc); qemu_vfree(svq->vring.used); diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h index e5e24c536d..c132c994e9 100644 --- a/hw/virtio/vhost-shadow-virtqueue.h +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -53,6 +53,12 @@ typedef struct VhostShadowVirtqueue { /* Next VirtQueue element that guest made available */ VirtQueueElement *next_guest_avail_elem; + /* + * Backup next field for each descriptor so we can recover securely, not + * needing to trust the device access. + */ + uint16_t *desc_next; + /* Next head to expose to the device */ uint16_t shadow_avail_idx; diff --git a/hw/virtio/vhost-user-blk-pci.c b/hw/virtio/vhost-user-blk-pci.c index 33b404d8a2..eef8641a98 100644 --- a/hw/virtio/vhost-user-blk-pci.c +++ b/hw/virtio/vhost-user-blk-pci.c @@ -26,7 +26,7 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/module.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" typedef struct VHostUserBlkPCI VHostUserBlkPCI; diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c index 2ed8492b3f..6829b8b743 100644 --- a/hw/virtio/vhost-user-fs-pci.c +++ b/hw/virtio/vhost-user-fs-pci.c @@ -14,7 +14,7 @@ #include "qemu/osdep.h" #include "hw/qdev-properties.h" #include "hw/virtio/vhost-user-fs.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" struct VHostUserFSPCI { diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index c595957983..e513e4fdda 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -219,8 +219,7 @@ static void vuf_device_realize(DeviceState *dev, Error **errp) return; } - virtio_init(vdev, "vhost-user-fs", VIRTIO_ID_FS, - sizeof(struct virtio_fs_config)); + virtio_init(vdev, VIRTIO_ID_FS, sizeof(struct virtio_fs_config)); /* Hiprio queue */ fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); @@ -277,6 +276,12 @@ static void vuf_device_unrealize(DeviceState *dev) fs->vhost_dev.vqs = NULL; } +static struct vhost_dev *vuf_get_vhost(VirtIODevice *vdev) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + return &fs->vhost_dev; +} + static const VMStateDescription vuf_vmstate = { .name = "vhost-user-fs", .unmigratable = 1, @@ -314,6 +319,7 @@ static void vuf_class_init(ObjectClass *klass, void *data) vdc->set_status = vuf_set_status; vdc->guest_notifier_mask = vuf_guest_notifier_mask; vdc->guest_notifier_pending = vuf_guest_notifier_pending; + vdc->get_vhost = vuf_get_vhost; } static const TypeInfo vuf_info = { diff --git a/hw/virtio/vhost-user-i2c-pci.c b/hw/virtio/vhost-user-i2c-pci.c index 70b7b65fd9..00ac10941f 100644 --- a/hw/virtio/vhost-user-i2c-pci.c +++ b/hw/virtio/vhost-user-i2c-pci.c @@ -9,7 +9,7 @@ #include "qemu/osdep.h" #include "hw/qdev-properties.h" #include "hw/virtio/vhost-user-i2c.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" struct VHostUserI2CPCI { VirtIOPCIProxy parent_obj; diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c index 42c7f6d9e5..6020eee093 100644 --- a/hw/virtio/vhost-user-i2c.c +++ b/hw/virtio/vhost-user-i2c.c @@ -14,11 +14,6 @@ #include "qemu/error-report.h" #include "standard-headers/linux/virtio_ids.h" -/* Remove this once the header is updated in Linux kernel */ -#ifndef VIRTIO_ID_I2C_ADAPTER -#define VIRTIO_ID_I2C_ADAPTER 34 -#endif - static const int feature_bits[] 
= { VIRTIO_I2C_F_ZERO_LENGTH_REQUEST, VHOST_INVALID_FEATURE_BIT @@ -227,7 +222,7 @@ static void vu_i2c_device_realize(DeviceState *dev, Error **errp) return; } - virtio_init(vdev, "vhost-user-i2c", VIRTIO_ID_I2C_ADAPTER, 0); + virtio_init(vdev, VIRTIO_ID_I2C_ADAPTER, 0); i2c->vhost_dev.nvqs = 1; i2c->vq = virtio_add_queue(vdev, 4, vu_i2c_handle_output); diff --git a/hw/virtio/vhost-user-input-pci.c b/hw/virtio/vhost-user-input-pci.c index c9d3e9113a..b858898a36 100644 --- a/hw/virtio/vhost-user-input-pci.c +++ b/hw/virtio/vhost-user-input-pci.c @@ -9,7 +9,7 @@ #include "hw/virtio/virtio-input.h" #include "qapi/error.h" #include "qemu/error-report.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" typedef struct VHostUserInputPCI VHostUserInputPCI; diff --git a/hw/virtio/vhost-user-rng-pci.c b/hw/virtio/vhost-user-rng-pci.c index c83dc86813..f64935453b 100644 --- a/hw/virtio/vhost-user-rng-pci.c +++ b/hw/virtio/vhost-user-rng-pci.c @@ -9,7 +9,7 @@ #include "qemu/osdep.h" #include "hw/qdev-properties.h" #include "hw/virtio/vhost-user-rng.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" struct VHostUserRNGPCI { VirtIOPCIProxy parent_obj; diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c index 209ee5bf9a..3a7bf8e32d 100644 --- a/hw/virtio/vhost-user-rng.c +++ b/hw/virtio/vhost-user-rng.c @@ -203,7 +203,7 @@ static void vu_rng_device_realize(DeviceState *dev, Error **errp) return; } - virtio_init(vdev, "vhost-user-rng", VIRTIO_ID_RNG, 0); + virtio_init(vdev, VIRTIO_ID_RNG, 0); rng->req_vq = virtio_add_queue(vdev, 4, vu_rng_handle_output); if (!rng->req_vq) { @@ -247,6 +247,12 @@ static void vu_rng_device_unrealize(DeviceState *dev) vhost_user_cleanup(&rng->vhost_user); } +static struct vhost_dev *vu_rng_get_vhost(VirtIODevice *vdev) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + return &rng->vhost_dev; +} + static const VMStateDescription vu_rng_vmstate = { .name = "vhost-user-rng", .unmigratable = 1, @@ -272,6 +278,7 @@ static void vu_rng_class_init(ObjectClass *klass, void *data) vdc->set_status = vu_rng_set_status; vdc->guest_notifier_mask = vu_rng_guest_notifier_mask; vdc->guest_notifier_pending = vu_rng_guest_notifier_pending; + vdc->get_vhost = vu_rng_get_vhost; } static const TypeInfo vu_rng_info = { diff --git a/hw/virtio/vhost-user-scsi-pci.c b/hw/virtio/vhost-user-scsi-pci.c index d5343412a1..75882e3cf9 100644 --- a/hw/virtio/vhost-user-scsi-pci.c +++ b/hw/virtio/vhost-user-scsi-pci.c @@ -30,7 +30,7 @@ #include "hw/pci/msix.h" #include "hw/loader.h" #include "sysemu/kvm.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" typedef struct VHostUserSCSIPCI VHostUserSCSIPCI; diff --git a/hw/virtio/vhost-user-vsock-pci.c b/hw/virtio/vhost-user-vsock-pci.c index 72a96199cd..e5a86e8013 100644 --- a/hw/virtio/vhost-user-vsock-pci.c +++ b/hw/virtio/vhost-user-vsock-pci.c @@ -10,7 +10,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/qdev-properties.h" #include "hw/virtio/vhost-user-vsock.h" #include "qom/object.h" diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c index 52bd682c34..0f8ff99f85 100644 --- a/hw/virtio/vhost-user-vsock.c +++ b/hw/virtio/vhost-user-vsock.c @@ -107,7 +107,7 @@ static void vuv_device_realize(DeviceState *dev, Error **errp) return; } - vhost_vsock_common_realize(vdev, "vhost-user-vsock"); + vhost_vsock_common_realize(vdev); vhost_dev_set_config_notifier(&vvc->vhost_dev, &vsock_ops); diff 
--git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index afd51f79b3..b040c1ad2b 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -489,6 +489,8 @@ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, return ret < 0 ? -saved_errno : -EIO; } + trace_vhost_user_write(msg->hdr.request, msg->hdr.flags); + return 0; } @@ -542,6 +544,8 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, } } + trace_vhost_user_read(msg.hdr.request, msg.hdr.flags); + return 0; } @@ -1170,14 +1174,16 @@ static void vhost_user_host_notifier_free(VhostUserHostNotifier *n) n->unmap_addr = NULL; } -static void vhost_user_host_notifier_remove(VhostUserState *user, - VirtIODevice *vdev, int queue_idx) +/* + * clean-up function for notifier, will finally free the structure + * under rcu. + */ +static void vhost_user_host_notifier_remove(VhostUserHostNotifier *n, + VirtIODevice *vdev) { - VhostUserHostNotifier *n = &user->notifier[queue_idx]; - if (n->addr) { if (vdev) { - virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false); + virtio_queue_set_host_notifier_mr(vdev, n->idx, &n->mr, false); } assert(!n->unmap_addr); n->unmap_addr = n->addr; @@ -1221,6 +1227,15 @@ static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) return 0; } +static VhostUserHostNotifier *fetch_notifier(VhostUserState *u, + int idx) +{ + if (idx >= u->notifiers->len) { + return NULL; + } + return g_ptr_array_index(u->notifiers, idx); +} + static int vhost_user_get_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { @@ -1233,7 +1248,10 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev, }; struct vhost_user *u = dev->opaque; - vhost_user_host_notifier_remove(u->user, dev->vdev, ring->index); + VhostUserHostNotifier *n = fetch_notifier(u->user, ring->index); + if (n) { + vhost_user_host_notifier_remove(n, dev->vdev); + } ret = vhost_user_write(dev, &msg, NULL, 0); if (ret < 0) { @@ -1498,6 +1516,29 @@ static int vhost_user_slave_handle_config_change(struct vhost_dev *dev) return dev->config_ops->vhost_dev_config_notifier(dev); } +/* + * Fetch or create the notifier for a given idx. Newly created + * notifiers are added to the pointer array that tracks them. + */ +static VhostUserHostNotifier *fetch_or_create_notifier(VhostUserState *u, + int idx) +{ + VhostUserHostNotifier *n = NULL; + if (idx >= u->notifiers->len) { + g_ptr_array_set_size(u->notifiers, idx); + } + + n = g_ptr_array_index(u->notifiers, idx); + if (!n) { + n = g_new0(VhostUserHostNotifier, 1); + n->idx = idx; + g_ptr_array_insert(u->notifiers, idx, n); + trace_vhost_user_create_notifier(idx, n); + } + + return n; +} + static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, VhostUserVringArea *area, int fd) @@ -1517,9 +1558,12 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, return -EINVAL; } - n = &user->notifier[queue_idx]; - - vhost_user_host_notifier_remove(user, vdev, queue_idx); + /* + * Fetch notifier and invalidate any old data before setting up + * new mapped address. 
+ */ + n = fetch_or_create_notifier(user, queue_idx); + vhost_user_host_notifier_remove(n, vdev); if (area->u64 & VHOST_USER_VRING_NOFD_MASK) { return 0; } @@ -1945,14 +1989,15 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier, static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, Error **errp) { - uint64_t features, protocol_features, ram_slots; + uint64_t features, ram_slots; struct vhost_user *u; + VhostUserState *vus = (VhostUserState *) opaque; int err; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); u = g_new0(struct vhost_user, 1); - u->user = opaque; + u->user = vus; u->dev = dev; dev->opaque = u; @@ -1963,6 +2008,10 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, } if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) { + bool supports_f_config = vus->supports_config || + (dev->config_ops && dev->config_ops->vhost_dev_config_notifier); + uint64_t protocol_features; + dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES, @@ -1972,19 +2021,34 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, return -EPROTO; } - dev->protocol_features = - protocol_features & VHOST_USER_PROTOCOL_FEATURE_MASK; + /* + * We will use all the protocol features we support - although + * we suppress F_CONFIG if we know QEMU's internal code cannot support + * it. + */ + protocol_features &= VHOST_USER_PROTOCOL_FEATURE_MASK; - if (!dev->config_ops || !dev->config_ops->vhost_dev_config_notifier) { - /* Don't acknowledge CONFIG feature if device doesn't support it */ - dev->protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG); - } else if (!(protocol_features & - (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))) { - error_setg(errp, "Device expects VHOST_USER_PROTOCOL_F_CONFIG " - "but backend does not support it."); - return -EINVAL; + if (supports_f_config) { + if (!virtio_has_feature(protocol_features, + VHOST_USER_PROTOCOL_F_CONFIG)) { + error_setg(errp, "vhost-user device %s expecting " + "VHOST_USER_PROTOCOL_F_CONFIG but the vhost-user backend does " + "not support it.", dev->vdev->name); + return -EPROTO; + } + } else { + if (virtio_has_feature(protocol_features, + VHOST_USER_PROTOCOL_F_CONFIG)) { + warn_reportf_err(*errp, "vhost-user backend supports " + "VHOST_USER_PROTOCOL_F_CONFIG for " + "device %s but QEMU does not.", + dev->vdev->name); + protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG); + } } + /* final set of protocol features */ + dev->protocol_features = protocol_features; err = vhost_user_set_protocol_features(dev, dev->protocol_features); if (err < 0) { error_setg_errno(errp, EPROTO, "vhost_backend_init failed"); @@ -2502,6 +2566,20 @@ static int vhost_user_set_inflight_fd(struct vhost_dev *dev, return vhost_user_write(dev, &msg, &inflight->fd, 1); } +static void vhost_user_state_destroy(gpointer data) +{ + VhostUserHostNotifier *n = (VhostUserHostNotifier *) data; + if (n) { + vhost_user_host_notifier_remove(n, NULL); + object_unparent(OBJECT(&n->mr)); + /* + * We can't free until vhost_user_host_notifier_remove has + * done its thing so schedule the free with RCU. 
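
Because the notifier array is now created with a per-element destroy function (see the g_ptr_array_new_full() call just below), vhost_user_cleanup() can drop every notifier with a single g_ptr_array_free(). A tiny standalone illustration of that GLib pattern (hypothetical Note type, not the QEMU structures):

.. code:: c

    #include <glib.h>

    typedef struct { int idx; } Note;

    int main(void)
    {
        /* the destroy function runs for every element when the array is freed */
        GPtrArray *a = g_ptr_array_new_full(8, g_free);

        g_ptr_array_add(a, g_new0(Note, 1));
        g_ptr_array_free(a, TRUE);
        return 0;
    }

In the patch the destroy function additionally unmaps the notifier and defers the final g_free() with RCU, since the data path may still be dereferencing it.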
+ */ + g_free_rcu(n, rcu); + } +} + bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) { if (user->chr) { @@ -2510,23 +2588,18 @@ bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) } user->chr = chr; user->memory_slots = 0; + user->notifiers = g_ptr_array_new_full(VIRTIO_QUEUE_MAX / 4, + &vhost_user_state_destroy); return true; } void vhost_user_cleanup(VhostUserState *user) { - int i; - VhostUserHostNotifier *n; - if (!user->chr) { return; } memory_region_transaction_begin(); - for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { - n = &user->notifier[i]; - vhost_user_host_notifier_remove(user, NULL, i); - object_unparent(OBJECT(&n->mr)); - } + user->notifiers = (GPtrArray *) g_ptr_array_free(user->notifiers, true); memory_region_transaction_commit(); user->chr = NULL; } diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index a30510ed17..66f054a12c 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -368,11 +368,18 @@ static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v) v->iova_range.last); } -static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) +/* + * The use of this function is for requests that only need to be + * applied once. Typically such request occurs at the beginning + * of operation, and before setting up queues. It should not be + * used for request that performs operation until all queues are + * set, which would need to check dev->vq_index_end instead. + */ +static bool vhost_vdpa_first_dev(struct vhost_dev *dev) { struct vhost_vdpa *v = dev->opaque; - return v->index != 0; + return v->index == 0; } static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, @@ -453,7 +460,7 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) vhost_vdpa_get_iova_range(v); - if (vhost_vdpa_one_time_request(dev)) { + if (!vhost_vdpa_first_dev(dev)) { return 0; } @@ -596,7 +603,7 @@ static int vhost_vdpa_memslots_limit(struct vhost_dev *dev) static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, struct vhost_memory *mem) { - if (vhost_vdpa_one_time_request(dev)) { + if (!vhost_vdpa_first_dev(dev)) { return 0; } @@ -625,7 +632,7 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, struct vhost_vdpa *v = dev->opaque; int ret; - if (vhost_vdpa_one_time_request(dev)) { + if (!vhost_vdpa_first_dev(dev)) { return 0; } @@ -667,7 +674,7 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) features &= f; - if (vhost_vdpa_one_time_request(dev)) { + if (vhost_vdpa_first_dev(dev)) { r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); if (r) { return -EFAULT; @@ -1018,7 +1025,7 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); struct vhost_vring_addr addr = { - .index = i, + .index = dev->vq_index + i, }; int r; bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); @@ -1120,7 +1127,7 @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, struct vhost_log *log) { struct vhost_vdpa *v = dev->opaque; - if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { return 0; } @@ -1172,11 +1179,11 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { struct vhost_vdpa *v = dev->opaque; + int vdpa_idx = ring->index - dev->vq_index; int ret; if (v->shadow_vqs_enabled) { - VhostShadowVirtqueue *svq = 
g_ptr_array_index(v->shadow_vqs, - ring->index); + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); /* * Setting base as last used idx, so destination will see as available @@ -1242,7 +1249,7 @@ static int vhost_vdpa_get_features(struct vhost_dev *dev, static int vhost_vdpa_set_owner(struct vhost_dev *dev) { - if (vhost_vdpa_one_time_request(dev)) { + if (!vhost_vdpa_first_dev(dev)) { return 0; } diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c index ed706681ac..7394818e00 100644 --- a/hw/virtio/vhost-vsock-common.c +++ b/hw/virtio/vhost-vsock-common.c @@ -224,12 +224,11 @@ int vhost_vsock_common_post_load(void *opaque, int version_id) return 0; } -void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name) +void vhost_vsock_common_realize(VirtIODevice *vdev) { VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); - virtio_init(vdev, name, VIRTIO_ID_VSOCK, - sizeof(struct virtio_vsock_config)); + virtio_init(vdev, VIRTIO_ID_VSOCK, sizeof(struct virtio_vsock_config)); /* Receive and transmit queues belong to vhost */ vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, @@ -259,6 +258,12 @@ void vhost_vsock_common_unrealize(VirtIODevice *vdev) virtio_cleanup(vdev); } +static struct vhost_dev *vhost_vsock_common_get_vhost(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + return &vvc->vhost_dev; +} + static Property vhost_vsock_common_properties[] = { DEFINE_PROP_ON_OFF_AUTO("seqpacket", VHostVSockCommon, seqpacket, ON_OFF_AUTO_AUTO), @@ -274,6 +279,7 @@ static void vhost_vsock_common_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_MISC, dc->categories); vdc->guest_notifier_mask = vhost_vsock_common_guest_notifier_mask; vdc->guest_notifier_pending = vhost_vsock_common_guest_notifier_pending; + vdc->get_vhost = vhost_vsock_common_get_vhost; } static const TypeInfo vhost_vsock_common_info = { diff --git a/hw/virtio/vhost-vsock-pci.c b/hw/virtio/vhost-vsock-pci.c index 205da8d1f5..9f34414d38 100644 --- a/hw/virtio/vhost-vsock-pci.c +++ b/hw/virtio/vhost-vsock-pci.c @@ -13,7 +13,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/qdev-properties.h" #include "hw/virtio/vhost-vsock.h" #include "qemu/module.h" diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c index 714046210b..0338de892f 100644 --- a/hw/virtio/vhost-vsock.c +++ b/hw/virtio/vhost-vsock.c @@ -169,7 +169,7 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp) } } - vhost_vsock_common_realize(vdev, "vhost-vsock"); + vhost_vsock_common_realize(vdev); ret = vhost_dev_init(&vvc->vhost_dev, (void *)(uintptr_t)vhostfd, VHOST_BACKEND_TYPE_KERNEL, 0, errp); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 2bc72c27c5..dd3263df56 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1738,6 +1738,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) /* should only be called after backend is connected */ assert(hdev->vhost_ops); + vdev->vhost_started = true; hdev->started = true; hdev->vdev = vdev; @@ -1810,7 +1811,7 @@ fail_vq: fail_mem: fail_features: - + vdev->vhost_started = false; hdev->started = false; return r; } @@ -1841,6 +1842,7 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev) } vhost_log_put(hdev, true); hdev->started = false; + vdev->vhost_started = false; hdev->vdev = NULL; } diff --git a/hw/virtio/virtio-9p-pci.c b/hw/virtio/virtio-9p-pci.c index e07adcd9ea..94c14f0b98 100644 --- 
a/hw/virtio/virtio-9p-pci.c +++ b/hw/virtio/virtio-9p-pci.c @@ -15,7 +15,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/9pfs/virtio-9p.h" #include "hw/qdev-properties.h" #include "qemu/module.h" diff --git a/hw/virtio/virtio-balloon-pci.c b/hw/virtio/virtio-balloon-pci.c index 79a3ba979a..ce2645ba71 100644 --- a/hw/virtio/virtio-balloon-pci.c +++ b/hw/virtio/virtio-balloon-pci.c @@ -14,7 +14,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/qdev-properties.h" #include "hw/virtio/virtio-balloon.h" #include "qapi/error.h" diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 8f1b38ef5c..73ac5eb675 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -882,8 +882,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) VirtIOBalloon *s = VIRTIO_BALLOON(dev); int ret; - virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON, - virtio_balloon_config_size(s)); + virtio_init(vdev, VIRTIO_ID_BALLOON, virtio_balloon_config_size(s)); ret = qemu_add_balloon_handler(virtio_balloon_to_target, virtio_balloon_stat, s); diff --git a/hw/virtio/virtio-blk-pci.c b/hw/virtio/virtio-blk-pci.c index 9d5795810c..9743bee965 100644 --- a/hw/virtio/virtio-blk-pci.c +++ b/hw/virtio/virtio-blk-pci.c @@ -19,7 +19,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-blk.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qapi/error.h" #include "qemu/module.h" #include "qom/object.h" diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 0f69d1c742..d7ec023adf 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -78,17 +78,23 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) return; } - vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); - if (klass->get_dma_as != NULL && has_iommu) { + vdev->dma_as = &address_space_memory; + if (has_iommu) { + vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + /* + * Present IOMMU_PLATFORM to the driver iff iommu_plattform=on and + * device operational. If the driver does not accept IOMMU_PLATFORM + * we fail the device. 
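
The virtio_bus_device_plugged() rework in this hunk reduces to a single predicate. A condensed restatement (hypothetical model_* helper with illustrative booleans, not the real signatures):

.. code:: c

    #include <stdbool.h>

    /*
     * With iommu_platform=off, DMA uses address_space_memory and plugging
     * always succeeds. With iommu_platform=on the feature is offered
     * unconditionally, and plugging fails only for a device that neither
     * supports VIRTIO_F_IOMMU_PLATFORM itself nor does its DMA through
     * the plain memory address space.
     */
    static bool model_iommu_plug_ok(bool iommu_platform_on,
                                    bool dev_supports_iommu,
                                    bool dma_as_is_memory)
    {
        return !iommu_platform_on || dev_supports_iommu || dma_as_is_memory;
    }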
+ */ virtio_add_feature(&vdev->host_features, VIRTIO_F_IOMMU_PLATFORM); - vdev->dma_as = klass->get_dma_as(qbus->parent); - if (!vdev_has_iommu && vdev->dma_as != &address_space_memory) { - error_setg(errp, + if (klass->get_dma_as) { + vdev->dma_as = klass->get_dma_as(qbus->parent); + if (!vdev_has_iommu && vdev->dma_as != &address_space_memory) { + error_setg(errp, "iommu_platform=true is not supported by the device"); - return; + return; + } } - } else { - vdev->dma_as = &address_space_memory; } } diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index dcd80b904d..c3829e7498 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -242,7 +242,7 @@ static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) } out_num = elem->out_num; - out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num); + out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num); out_iov = out_iov_copy; in_num = elem->in_num; @@ -605,11 +605,11 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request) } out_num = elem->out_num; - out_iov_copy = g_memdup(elem->out_sg, sizeof(out_iov[0]) * out_num); + out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num); out_iov = out_iov_copy; in_num = elem->in_num; - in_iov_copy = g_memdup(elem->in_sg, sizeof(in_iov[0]) * in_num); + in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num); in_iov = in_iov_copy; if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req)) @@ -810,7 +810,7 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp) return; } - virtio_init(vdev, "virtio-crypto", VIRTIO_ID_CRYPTO, vcrypto->config_size); + virtio_init(vdev, VIRTIO_ID_CRYPTO, vcrypto->config_size); vcrypto->curr_queues = 1; vcrypto->vqs = g_new0(VirtIOCryptoQueue, vcrypto->max_queues); for (i = 0; i < vcrypto->max_queues; i++) { @@ -961,6 +961,15 @@ static bool virtio_crypto_guest_notifier_pending(VirtIODevice *vdev, int idx) return cryptodev_vhost_virtqueue_pending(vdev, queue, idx); } +static struct vhost_dev *virtio_crypto_get_vhost(VirtIODevice *vdev) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + CryptoDevBackend *b = vcrypto->cryptodev; + CryptoDevBackendClient *cc = b->conf.peers.ccs[0]; + CryptoDevBackendVhost *vhost_crypto = cryptodev_get_vhost(cc, b, 0); + return &vhost_crypto->dev; +} + static void virtio_crypto_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -977,6 +986,7 @@ static void virtio_crypto_class_init(ObjectClass *klass, void *data) vdc->set_status = virtio_crypto_set_status; vdc->guest_notifier_mask = virtio_crypto_guest_notifier_mask; vdc->guest_notifier_pending = virtio_crypto_guest_notifier_pending; + vdc->get_vhost = virtio_crypto_get_vhost; } static void virtio_crypto_instance_init(Object *obj) diff --git a/hw/virtio/virtio-input-host-pci.c b/hw/virtio/virtio-input-host-pci.c index 0ac360de4f..cf8a9cf9e8 100644 --- a/hw/virtio/virtio-input-host-pci.c +++ b/hw/virtio/virtio-input-host-pci.c @@ -8,7 +8,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-input.h" #include "qemu/module.h" #include "qom/object.h" diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c index 48e9ff38e2..a9d0992389 100644 --- a/hw/virtio/virtio-input-pci.c +++ b/hw/virtio/virtio-input-pci.c @@ -8,7 +8,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/qdev-properties.h" #include 
"hw/virtio/virtio-input.h" #include "qemu/module.h" diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c index 6a1df7fe50..844d647704 100644 --- a/hw/virtio/virtio-iommu-pci.c +++ b/hw/virtio/virtio-iommu-pci.c @@ -11,7 +11,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-iommu.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 57c09d98a9..2597e166f9 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -1033,8 +1033,7 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOIOMMU *s = VIRTIO_IOMMU(dev); - virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU, - sizeof(struct virtio_iommu_config)); + virtio_init(vdev, VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config)); memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num)); diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 5aca408726..30d03e987a 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -867,8 +867,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) vmem->block_size; vmem->bitmap = bitmap_new(vmem->bitmap_size); - virtio_init(vdev, TYPE_VIRTIO_MEM, VIRTIO_ID_MEM, - sizeof(struct virtio_mem_config)); + virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config)); vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request); host_memory_backend_set_mapped(vmem->memdev, true); diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c index aa0b3caecb..e03543a70a 100644 --- a/hw/virtio/virtio-net-pci.c +++ b/hw/virtio/virtio-net-pci.c @@ -19,7 +19,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-net.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qapi/error.h" #include "qemu/module.h" #include "qom/object.h" diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 7cf1231c1c..0566ad7d00 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -33,11 +33,12 @@ #include "hw/pci/msix.h" #include "hw/loader.h" #include "sysemu/kvm.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qemu/range.h" #include "hw/virtio/virtio-bus.h" #include "qapi/visitor.h" #include "sysemu/replay.h" +#include "trace.h" #define VIRTIO_PCI_REGION_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_present(dev)) @@ -1380,6 +1381,7 @@ static void virtio_pci_notify_write(void *opaque, hwaddr addr, unsigned queue = addr / virtio_pci_queue_mem_mult(proxy); if (vdev != NULL && queue < VIRTIO_QUEUE_MAX) { + trace_virtio_pci_notify_write(addr, val, size); virtio_queue_notify(vdev, queue); } } @@ -1393,6 +1395,7 @@ static void virtio_pci_notify_write_pio(void *opaque, hwaddr addr, unsigned queue = val; if (vdev != NULL && queue < VIRTIO_QUEUE_MAX) { + trace_virtio_pci_notify_write_pio(addr, val, size); virtio_queue_notify(vdev, queue); } } diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c index 5dd21c2c44..a1abfe0e1b 100644 --- a/hw/virtio/virtio-pmem.c +++ b/hw/virtio/virtio-pmem.c @@ -122,8 +122,7 @@ static void virtio_pmem_realize(DeviceState *dev, Error **errp) } host_memory_backend_set_mapped(pmem->memdev, true); - virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM, - sizeof(struct virtio_pmem_config)); + virtio_init(vdev, VIRTIO_ID_PMEM, sizeof(struct virtio_pmem_config)); pmem->rq_vq = virtio_add_queue(vdev, 128, 
virtio_pmem_flush); } diff --git a/hw/virtio/virtio-rng-pci.c b/hw/virtio/virtio-rng-pci.c index c1f916268b..151ece6f94 100644 --- a/hw/virtio/virtio-rng-pci.c +++ b/hw/virtio/virtio-rng-pci.c @@ -11,7 +11,7 @@ #include "qemu/osdep.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "hw/virtio/virtio-rng.h" #include "qapi/error.h" #include "qemu/module.h" diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c index cc8e9f775d..7e12fc03bf 100644 --- a/hw/virtio/virtio-rng.c +++ b/hw/virtio/virtio-rng.c @@ -215,7 +215,7 @@ static void virtio_rng_device_realize(DeviceState *dev, Error **errp) return; } - virtio_init(vdev, "virtio-rng", VIRTIO_ID_RNG, 0); + virtio_init(vdev, VIRTIO_ID_RNG, 0); vrng->vq = virtio_add_queue(vdev, 8, handle_input); vrng->quota_remaining = vrng->conf.max_bytes; diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c index 97fab74236..e8e3442f38 100644 --- a/hw/virtio/virtio-scsi-pci.c +++ b/hw/virtio/virtio-scsi-pci.c @@ -18,7 +18,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-scsi.h" #include "qemu/module.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" typedef struct VirtIOSCSIPCI VirtIOSCSIPCI; diff --git a/hw/virtio/virtio-serial-pci.c b/hw/virtio/virtio-serial-pci.c index 35bcd961c9..cea31adcc4 100644 --- a/hw/virtio/virtio-serial-pci.c +++ b/hw/virtio/virtio-serial-pci.c @@ -20,7 +20,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio-serial.h" #include "qemu/module.h" -#include "virtio-pci.h" +#include "hw/virtio/virtio-pci.h" #include "qom/object.h" typedef struct VirtIOSerialPCI VirtIOSerialPCI; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 67a873f54a..5d607aeaa0 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -132,6 +132,56 @@ struct VirtQueue QLIST_ENTRY(VirtQueue) node; }; +const char *virtio_device_names[] = { + [VIRTIO_ID_NET] = "virtio-net", + [VIRTIO_ID_BLOCK] = "virtio-blk", + [VIRTIO_ID_CONSOLE] = "virtio-serial", + [VIRTIO_ID_RNG] = "virtio-rng", + [VIRTIO_ID_BALLOON] = "virtio-balloon", + [VIRTIO_ID_IOMEM] = "virtio-iomem", + [VIRTIO_ID_RPMSG] = "virtio-rpmsg", + [VIRTIO_ID_SCSI] = "virtio-scsi", + [VIRTIO_ID_9P] = "virtio-9p", + [VIRTIO_ID_MAC80211_WLAN] = "virtio-mac-wlan", + [VIRTIO_ID_RPROC_SERIAL] = "virtio-rproc-serial", + [VIRTIO_ID_CAIF] = "virtio-caif", + [VIRTIO_ID_MEMORY_BALLOON] = "virtio-mem-balloon", + [VIRTIO_ID_GPU] = "virtio-gpu", + [VIRTIO_ID_CLOCK] = "virtio-clk", + [VIRTIO_ID_INPUT] = "virtio-input", + [VIRTIO_ID_VSOCK] = "vhost-vsock", + [VIRTIO_ID_CRYPTO] = "virtio-crypto", + [VIRTIO_ID_SIGNAL_DIST] = "virtio-signal", + [VIRTIO_ID_PSTORE] = "virtio-pstore", + [VIRTIO_ID_IOMMU] = "virtio-iommu", + [VIRTIO_ID_MEM] = "virtio-mem", + [VIRTIO_ID_SOUND] = "virtio-sound", + [VIRTIO_ID_FS] = "virtio-user-fs", + [VIRTIO_ID_PMEM] = "virtio-pmem", + [VIRTIO_ID_RPMB] = "virtio-rpmb", + [VIRTIO_ID_MAC80211_HWSIM] = "virtio-mac-hwsim", + [VIRTIO_ID_VIDEO_ENCODER] = "virtio-vid-encoder", + [VIRTIO_ID_VIDEO_DECODER] = "virtio-vid-decoder", + [VIRTIO_ID_SCMI] = "virtio-scmi", + [VIRTIO_ID_NITRO_SEC_MOD] = "virtio-nitro-sec-mod", + [VIRTIO_ID_I2C_ADAPTER] = "vhost-user-i2c", + [VIRTIO_ID_WATCHDOG] = "virtio-watchdog", + [VIRTIO_ID_CAN] = "virtio-can", + [VIRTIO_ID_DMABUF] = "virtio-dmabuf", + [VIRTIO_ID_PARAM_SERV] = "virtio-param-serv", + [VIRTIO_ID_AUDIO_POLICY] = "virtio-audio-pol", + [VIRTIO_ID_BT] = "virtio-bluetooth", + [VIRTIO_ID_GPIO] = "virtio-gpio" +}; + +static const char *virtio_id_to_name(uint16_t 
device_id)
+{
+    assert(device_id < G_N_ELEMENTS(virtio_device_names));
+    const char *name = virtio_device_names[device_id];
+    assert(name != NULL);
+    return name;
+}
+
 /* Called within call_rcu(). */
 static void virtio_free_region_cache(VRingMemoryRegionCaches *caches)
 {
@@ -3207,8 +3257,7 @@ void virtio_instance_init_common(Object *proxy_obj, void *data,
     qdev_alias_all_properties(vdev, proxy_obj);
 }
 
-void virtio_init(VirtIODevice *vdev, const char *name,
-                 uint16_t device_id, size_t config_size)
+void virtio_init(VirtIODevice *vdev, uint16_t device_id, size_t config_size)
 {
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
@@ -3222,6 +3271,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
 
     vdev->start_on_kick = false;
     vdev->started = false;
+    vdev->vhost_started = false;
     vdev->device_id = device_id;
     vdev->status = 0;
     qatomic_set(&vdev->isr, 0);
@@ -3237,7 +3287,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
         vdev->vq[i].host_notifier_enabled = false;
     }
 
-    vdev->name = name;
+    vdev->name = virtio_id_to_name(device_id);
     vdev->config_len = config_size;
     if (vdev->config_len) {
         vdev->config = g_malloc0(config_size);
diff --git a/include/hw/acpi/cxl.h b/include/hw/acpi/cxl.h
new file mode 100644
index 0000000000..0c496538c0
--- /dev/null
+++ b/include/hw/acpi/cxl.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_ACPI_CXL_H
+#define HW_ACPI_CXL_H
+
+#include "hw/acpi/bios-linker-loader.h"
+
+void cxl_build_cedt(MachineState *ms, GArray *table_offsets, GArray *table_data,
+                    BIOSLinker *linker, const char *oem_id,
+                    const char *oem_table_id);
+void build_cxl_osc_method(Aml *dev);
+
+#endif
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 7b416c9787..fa57bac4fb 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -269,6 +269,7 @@ struct MachineClass {
     bool ignore_boot_device_suffixes;
     bool smbus_no_migration_support;
     bool nvdimm_supported;
+    bool cxl_supported;
     bool numa_mem_supported;
     bool auto_enable_numa;
     SMPCompatProps smp_props;
@@ -359,6 +360,7 @@ struct MachineState {
     CPUArchIdList *possible_cpus;
     CpuTopology smp;
     struct NVDIMMState *nvdimms_state;
+    struct CXLState *cxl_devices_state;
     struct NumaState *numa_state;
 };
diff --git a/include/hw/cxl/cxl.h b/include/hw/cxl/cxl.h
new file mode 100644
index 0000000000..21d28ca110
--- /dev/null
+++ b/include/hw/cxl/cxl.h
@@ -0,0 +1,61 @@
+/*
+ * QEMU CXL Support
+ *
+ * Copyright (c) 2020 Intel
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef CXL_H
+#define CXL_H
+
+
+#include "qapi/qapi-types-machine.h"
+#include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_host.h"
+#include "cxl_pci.h"
+#include "cxl_component.h"
+#include "cxl_device.h"
+
+#define CXL_COMPONENT_REG_BAR_IDX 0
+#define CXL_DEVICE_REG_BAR_IDX 2
+
+#define CXL_WINDOW_MAX 10
+
+typedef struct CXLFixedWindow {
+    uint64_t size;
+    char **targets;
+    struct PXBDev *target_hbs[8];
+    uint8_t num_targets;
+    uint8_t enc_int_ways;
+    uint8_t enc_int_gran;
+    /* Todo: XOR based interleaving */
+    MemoryRegion mr;
+    hwaddr base;
+} CXLFixedWindow;
+
+typedef struct CXLState {
+    bool is_enabled;
+    MemoryRegion host_mr;
+    unsigned int next_mr_idx;
+    GList *fixed_windows;
+} CXLState;
+
+struct CXLHost {
+    PCIHostState parent_obj;
+
+    CXLComponentState cxl_cstate;
+};
+
+#define TYPE_PXB_CXL_HOST "pxb-cxl-host"
+OBJECT_DECLARE_SIMPLE_TYPE(CXLHost, PXB_CXL_HOST)
+
+void cxl_fixed_memory_window_config(MachineState *ms,
+                                    CXLFixedMemoryWindowOptions *object,
+                                    Error **errp);
+void cxl_fixed_memory_window_link_targets(Error **errp);
+
+extern const MemoryRegionOps cfmws_ops;
+
+#endif
diff --git a/include/hw/cxl/cxl_component.h b/include/hw/cxl/cxl_component.h
new file mode 100644
index 0000000000..70b5018156
--- /dev/null
+++ b/include/hw/cxl/cxl_component.h
@@ -0,0 +1,223 @@
+/*
+ * QEMU CXL Component
+ *
+ * Copyright (c) 2020 Intel
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef CXL_COMPONENT_H
+#define CXL_COMPONENT_H
+
+/* CXL 2.0 - 8.2.4 */
+#define CXL2_COMPONENT_IO_REGION_SIZE 0x1000
+#define CXL2_COMPONENT_CM_REGION_SIZE 0x1000
+#define CXL2_COMPONENT_BLOCK_SIZE 0x10000
+
+#include "qemu/compiler.h"
+#include "qemu/range.h"
+#include "qemu/typedefs.h"
+#include "hw/register.h"
+
+enum reg_type {
+    CXL2_DEVICE,
+    CXL2_TYPE3_DEVICE,
+    CXL2_LOGICAL_DEVICE,
+    CXL2_ROOT_PORT,
+    CXL2_UPSTREAM_PORT,
+    CXL2_DOWNSTREAM_PORT
+};
+
+/*
+ * Capability registers are defined at the top of the CXL.cache/mem region and
+ * are packed. For our purposes we will always define the caps in the same
+ * order. See CXL 2.0 - 8.2.5 Table 142 for details.
+ */
+
+/* CXL 2.0 - 8.2.5.1 */
+REG32(CXL_CAPABILITY_HEADER, 0)
+    FIELD(CXL_CAPABILITY_HEADER, ID, 0, 16)
+    FIELD(CXL_CAPABILITY_HEADER, VERSION, 16, 4)
+    FIELD(CXL_CAPABILITY_HEADER, CACHE_MEM_VERSION, 20, 4)
+    FIELD(CXL_CAPABILITY_HEADER, ARRAY_SIZE, 24, 8)
+
+#define CXLx_CAPABILITY_HEADER(type, offset)                  \
+    REG32(CXL_##type##_CAPABILITY_HEADER, offset)             \
+        FIELD(CXL_##type##_CAPABILITY_HEADER, ID, 0, 16)      \
+        FIELD(CXL_##type##_CAPABILITY_HEADER, VERSION, 16, 4) \
+        FIELD(CXL_##type##_CAPABILITY_HEADER, PTR, 20, 12)
+CXLx_CAPABILITY_HEADER(RAS, 0x4)
+CXLx_CAPABILITY_HEADER(LINK, 0x8)
+CXLx_CAPABILITY_HEADER(HDM, 0xc)
+CXLx_CAPABILITY_HEADER(EXTSEC, 0x10)
+CXLx_CAPABILITY_HEADER(SNOOP, 0x14)
+
+/*
+ * Capability structures contain the actual registers that the CXL component
+ * implements. Some of these are specific to certain types of components, but
+ * this implementation leaves enough space regardless.
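+ *
+ * An illustrative decode of one of the headers above, using the
+ * hw/registerfields.h helpers (a sketch, not code in this file):
+ *
+ *   uint32_t hdr = cache_mem_registers[R_CXL_RAS_CAPABILITY_HEADER];
+ *   uint16_t id  = FIELD_EX32(hdr, CXL_RAS_CAPABILITY_HEADER, ID);
+ *   uint16_t ptr = FIELD_EX32(hdr, CXL_RAS_CAPABILITY_HEADER, PTR);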
+ */ +/* 8.2.5.9 - CXL RAS Capability Structure */ + +/* Give ample space for caps before this */ +#define CXL_RAS_REGISTERS_OFFSET 0x80 +#define CXL_RAS_REGISTERS_SIZE 0x58 +REG32(CXL_RAS_UNC_ERR_STATUS, CXL_RAS_REGISTERS_OFFSET) +REG32(CXL_RAS_UNC_ERR_MASK, CXL_RAS_REGISTERS_OFFSET + 0x4) +REG32(CXL_RAS_UNC_ERR_SEVERITY, CXL_RAS_REGISTERS_OFFSET + 0x8) +REG32(CXL_RAS_COR_ERR_STATUS, CXL_RAS_REGISTERS_OFFSET + 0xc) +REG32(CXL_RAS_COR_ERR_MASK, CXL_RAS_REGISTERS_OFFSET + 0x10) +REG32(CXL_RAS_ERR_CAP_CTRL, CXL_RAS_REGISTERS_OFFSET + 0x14) +/* Offset 0x18 - 0x58 reserved for RAS logs */ + +/* 8.2.5.10 - CXL Security Capability Structure */ +#define CXL_SEC_REGISTERS_OFFSET \ + (CXL_RAS_REGISTERS_OFFSET + CXL_RAS_REGISTERS_SIZE) +#define CXL_SEC_REGISTERS_SIZE 0 /* We don't implement 1.1 downstream ports */ + +/* 8.2.5.11 - CXL Link Capability Structure */ +#define CXL_LINK_REGISTERS_OFFSET \ + (CXL_SEC_REGISTERS_OFFSET + CXL_SEC_REGISTERS_SIZE) +#define CXL_LINK_REGISTERS_SIZE 0x38 + +/* 8.2.5.12 - CXL HDM Decoder Capability Structure */ +#define HDM_DECODE_MAX 10 /* 8.2.5.12.1 */ +#define CXL_HDM_REGISTERS_OFFSET \ + (CXL_LINK_REGISTERS_OFFSET + CXL_LINK_REGISTERS_SIZE) +#define CXL_HDM_REGISTERS_SIZE (0x10 + 0x20 * HDM_DECODE_MAX) +#define HDM_DECODER_INIT(n) \ + REG32(CXL_HDM_DECODER##n##_BASE_LO, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x10) \ + FIELD(CXL_HDM_DECODER##n##_BASE_LO, L, 28, 4) \ + REG32(CXL_HDM_DECODER##n##_BASE_HI, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x14) \ + REG32(CXL_HDM_DECODER##n##_SIZE_LO, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x18) \ + REG32(CXL_HDM_DECODER##n##_SIZE_HI, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x1C) \ + REG32(CXL_HDM_DECODER##n##_CTRL, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x20) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, IG, 0, 4) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, IW, 4, 4) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, LOCK_ON_COMMIT, 8, 1) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, COMMIT, 9, 1) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, COMMITTED, 10, 1) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, ERR, 11, 1) \ + FIELD(CXL_HDM_DECODER##n##_CTRL, TYPE, 12, 1) \ + REG32(CXL_HDM_DECODER##n##_TARGET_LIST_LO, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x24) \ + REG32(CXL_HDM_DECODER##n##_TARGET_LIST_HI, \ + CXL_HDM_REGISTERS_OFFSET + (0x20 * n) + 0x28) + +REG32(CXL_HDM_DECODER_CAPABILITY, CXL_HDM_REGISTERS_OFFSET) + FIELD(CXL_HDM_DECODER_CAPABILITY, DECODER_COUNT, 0, 4) + FIELD(CXL_HDM_DECODER_CAPABILITY, TARGET_COUNT, 4, 4) + FIELD(CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_256B, 8, 1) + FIELD(CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_4K, 9, 1) + FIELD(CXL_HDM_DECODER_CAPABILITY, POISON_ON_ERR_CAP, 10, 1) +REG32(CXL_HDM_DECODER_GLOBAL_CONTROL, CXL_HDM_REGISTERS_OFFSET + 4) + FIELD(CXL_HDM_DECODER_GLOBAL_CONTROL, POISON_ON_ERR_EN, 0, 1) + FIELD(CXL_HDM_DECODER_GLOBAL_CONTROL, HDM_DECODER_ENABLE, 1, 1) + +HDM_DECODER_INIT(0); + +/* 8.2.5.13 - CXL Extended Security Capability Structure (Root complex only) */ +#define EXTSEC_ENTRY_MAX 256 +#define CXL_EXTSEC_REGISTERS_OFFSET \ + (CXL_HDM_REGISTERS_OFFSET + CXL_HDM_REGISTERS_SIZE) +#define CXL_EXTSEC_REGISTERS_SIZE (8 * EXTSEC_ENTRY_MAX + 4) + +/* 8.2.5.14 - CXL IDE Capability Structure */ +#define CXL_IDE_REGISTERS_OFFSET \ + (CXL_EXTSEC_REGISTERS_OFFSET + CXL_EXTSEC_REGISTERS_SIZE) +#define CXL_IDE_REGISTERS_SIZE 0x20 + +/* 8.2.5.15 - CXL Snoop Filter Capability Structure */ +#define CXL_SNOOP_REGISTERS_OFFSET \ + (CXL_IDE_REGISTERS_OFFSET + CXL_IDE_REGISTERS_SIZE) +#define CXL_SNOOP_REGISTERS_SIZE 0x8 + 
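+/*
+ * The capability structures above are allocated back to back, so verify at
+ * build time that the end of the last one still fits within the 4KiB
+ * CXL.cache/mem region.
+ */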
+QEMU_BUILD_BUG_MSG((CXL_SNOOP_REGISTERS_OFFSET + CXL_SNOOP_REGISTERS_SIZE) >= 0x1000,
+                   "No space for registers");
+
+typedef struct component_registers {
+    /*
+     * Main memory region to be registered with QEMU core.
+     */
+    MemoryRegion component_registers;
+
+    /*
+     * 8.2.4 Table 141:
+     *   0x0000 - 0x0fff CXL.io registers
+     *   0x1000 - 0x1fff CXL.cache and CXL.mem
+     *   0x2000 - 0xdfff Implementation specific
+     *   0xe000 - 0xe3ff CXL ARB/MUX registers
+     *   0xe400 - 0xffff RSVD
+     */
+    uint32_t io_registers[CXL2_COMPONENT_IO_REGION_SIZE >> 2];
+    MemoryRegion io;
+
+    uint32_t cache_mem_registers[CXL2_COMPONENT_CM_REGION_SIZE >> 2];
+    uint32_t cache_mem_regs_write_mask[CXL2_COMPONENT_CM_REGION_SIZE >> 2];
+    MemoryRegion cache_mem;
+
+    MemoryRegion impl_specific;
+    MemoryRegion arb_mux;
+    MemoryRegion rsvd;
+
+    /* special_ops is used for any component that needs any specific handling */
+    MemoryRegionOps *special_ops;
+} ComponentRegisters;
+
+/*
+ * A CXL component represents all entities in a CXL hierarchy. This includes
+ * host bridges, root ports, upstream/downstream switch ports, and devices.
+ */
+typedef struct cxl_component {
+    ComponentRegisters crb;
+    union {
+        struct {
+            Range dvsecs[CXL20_MAX_DVSEC];
+            uint16_t dvsec_offset;
+            struct PCIDevice *pdev;
+        };
+    };
+} CXLComponentState;
+
+void cxl_component_register_block_init(Object *obj,
+                                       CXLComponentState *cxl_cstate,
+                                       const char *type);
+void cxl_component_register_init_common(uint32_t *reg_state,
+                                        uint32_t *write_msk,
+                                        enum reg_type type);
+
+void cxl_component_create_dvsec(CXLComponentState *cxl_cstate,
+                                enum reg_type cxl_dev_type, uint16_t length,
+                                uint16_t type, uint8_t rev, uint8_t *body);
+
+static inline int cxl_decoder_count_enc(int count)
+{
+    switch (count) {
+    case 1: return 0;
+    case 2: return 1;
+    case 4: return 2;
+    case 6: return 3;
+    case 8: return 4;
+    case 10: return 5;
+    }
+    return 0;
+}
+
+uint8_t cxl_interleave_ways_enc(int iw, Error **errp);
+uint8_t cxl_interleave_granularity_enc(uint64_t gran, Error **errp);
+
+static inline hwaddr cxl_decode_ig(int ig)
+{
+    return 1 << (ig + 8);
+}
+
+CXLComponentState *cxl_get_hb_cstate(PCIHostState *hb);
+
+#endif
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
new file mode 100644
index 0000000000..1e141b6621
--- /dev/null
+++ b/include/hw/cxl/cxl_device.h
@@ -0,0 +1,268 @@
+/*
+ * QEMU CXL Devices
+ *
+ * Copyright (c) 2020 Intel
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef CXL_DEVICE_H
+#define CXL_DEVICE_H
+
+#include "hw/register.h"
+
+/*
+ * The following is how a CXL device's Memory Device registers are laid out.
+ * The only requirement from the spec is that the capabilities array and the
+ * capability headers start at offset 0 and are contiguously packed. The
+ * headers themselves provide offsets to the register fields. For this
+ * emulation, the actual registers will start at offset 0x80 (m == 0x80). No
+ * secondary mailbox is implemented, which means that the offset of the start
+ * of the mailbox payload (n) is given by
+ * n = m + sizeof(mailbox registers) + sizeof(device registers).
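+ *
+ * With the sizes defined below that works out to n = 0x80 + 0x20 + 0x8 = 0xa8
+ * (an illustrative figure; the offset macros below are authoritative).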
+ *
+ *                        +---------------------------------+
+ *                        |                                 |
+ *                        |     Memory Device Registers     |
+ *                        |                                 |
+ * n + PAYLOAD_SIZE_MAX   -----------------------------------
+ *                   ^    |                                 |
+ *                   |    |                                 |
+ *                   |    |                                 |
+ *                   |    |                                 |
+ *                   |    |                                 |
+ *                   |    |         Mailbox Payload         |
+ *                   |    |                                 |
+ *                   |    |                                 |
+ *                   |    |                                 |
+ *                   n    -----------------------------------
+ *                   ^    |        Mailbox Registers        |
+ *                   |    |                                 |
+ *                   |    -----------------------------------
+ *                   |    |                                 |
+ *                   |    |        Device Registers         |
+ *                   |    |                                 |
+ *                   m    -----------------------------------
+ *                   ^    | Memory Device Capability Header |
+ *                   |    -----------------------------------
+ *                   |    |    Mailbox Capability Header    |
+ *                   |    -----------------------------------
+ *                   |    |    Device Capability Header     |
+ *                   |    -----------------------------------
+ *                   |    |    Device Cap Array Register    |
+ *                   0    +---------------------------------+
+ *
+ */
+
+#define CXL_DEVICE_CAP_HDR1_OFFSET 0x10 /* Figure 138 */
+#define CXL_DEVICE_CAP_REG_SIZE 0x10 /* 8.2.8.2 */
+#define CXL_DEVICE_CAPS_MAX 4 /* 8.2.8.2.1 + 8.2.8.5 */
+#define CXL_CAPS_SIZE \
+    (CXL_DEVICE_CAP_REG_SIZE * (CXL_DEVICE_CAPS_MAX + 1)) /* +1 for header */
+
+#define CXL_DEVICE_STATUS_REGISTERS_OFFSET 0x80 /* Read comment above */
+#define CXL_DEVICE_STATUS_REGISTERS_LENGTH 0x8 /* 8.2.8.3.1 */
+
+#define CXL_MAILBOX_REGISTERS_OFFSET \
+    (CXL_DEVICE_STATUS_REGISTERS_OFFSET + CXL_DEVICE_STATUS_REGISTERS_LENGTH)
+#define CXL_MAILBOX_REGISTERS_SIZE 0x20 /* 8.2.8.4, Figure 139 */
+#define CXL_MAILBOX_PAYLOAD_SHIFT 11
+#define CXL_MAILBOX_MAX_PAYLOAD_SIZE (1 << CXL_MAILBOX_PAYLOAD_SHIFT)
+#define CXL_MAILBOX_REGISTERS_LENGTH \
+    (CXL_MAILBOX_REGISTERS_SIZE + CXL_MAILBOX_MAX_PAYLOAD_SIZE)
+
+#define CXL_MEMORY_DEVICE_REGISTERS_OFFSET \
+    (CXL_MAILBOX_REGISTERS_OFFSET + CXL_MAILBOX_REGISTERS_LENGTH)
+#define CXL_MEMORY_DEVICE_REGISTERS_LENGTH 0x8
+
+#define CXL_MMIO_SIZE \
+    (CXL_DEVICE_CAP_REG_SIZE + CXL_DEVICE_STATUS_REGISTERS_LENGTH + \
+     CXL_MAILBOX_REGISTERS_LENGTH + CXL_MEMORY_DEVICE_REGISTERS_LENGTH)
+
+typedef struct cxl_device_state {
+    MemoryRegion device_registers;
+
+    /* mmio for device capabilities array - 8.2.8.2 */
+    MemoryRegion device;
+    MemoryRegion memory_device;
+    struct {
+        MemoryRegion caps;
+        union {
+            uint32_t caps_reg_state32[CXL_CAPS_SIZE / 4];
+            uint64_t caps_reg_state64[CXL_CAPS_SIZE / 8];
+        };
+    };
+
+    /* mmio for the mailbox registers 8.2.8.4 */
+    struct {
+        MemoryRegion mailbox;
+        uint16_t payload_size;
+        union {
+            uint8_t mbox_reg_state[CXL_MAILBOX_REGISTERS_LENGTH];
+            uint16_t mbox_reg_state16[CXL_MAILBOX_REGISTERS_LENGTH / 2];
+            uint32_t mbox_reg_state32[CXL_MAILBOX_REGISTERS_LENGTH / 4];
+            uint64_t mbox_reg_state64[CXL_MAILBOX_REGISTERS_LENGTH / 8];
+        };
+        struct cel_log {
+            uint16_t opcode;
+            uint16_t effect;
+        } cel_log[1 << 16];
+        size_t cel_size;
+    };
+
+    struct {
+        bool set;
+        uint64_t last_set;
+        uint64_t host_set;
+    } timestamp;
+
+    /* memory region for persistent memory, HDM */
+    uint64_t pmem_size;
+} CXLDeviceState;
+
+/* Initialize the register block for a device */
+void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev);
+
+/* Set up default values for the register block */
+void cxl_device_register_init_common(CXLDeviceState *dev);
+
+/*
+ * CXL 2.0 - 8.2.8.1 including errata F4
+ * Documented as a 128 bit register, but only 64 bit accesses are used
+ * and the second 64 bits are currently reserved.
+ */
+REG64(CXL_DEV_CAP_ARRAY, 0) /* Documented as 128 bit register but 64 bit accesses */
+    FIELD(CXL_DEV_CAP_ARRAY, CAP_ID, 0, 16)
+    FIELD(CXL_DEV_CAP_ARRAY, CAP_VERSION, 16, 8)
+    FIELD(CXL_DEV_CAP_ARRAY, CAP_COUNT, 32, 16)
+
+/*
+ * Helper macro to initialize capability headers for CXL devices.
+ *
+ * In 8.2.8.2, this is listed as a 128b register, but in 8.2.8, it says:
+ * > No registers defined in Section 8.2.8 are larger than 64-bits wide so that
+ * > is the maximum access size allowed for these registers. If this rule is not
+ * > followed, the behavior is undefined
+ *
+ * CXL 2.0 Errata F4 states further that the layouts in the specification are
+ * shown as greater than 128 bits, but implementations are expected to
+ * use any size of access up to 64 bits.
+ *
+ * Here we've chosen to make it 4 dwords. The spec allows any power-of-2
+ * access size up to 64 bits to be used for these registers.
+ */
+#define CXL_DEVICE_CAPABILITY_HEADER_REGISTER(n, offset)        \
+    REG32(CXL_DEV_##n##_CAP_HDR0, offset)                       \
+        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_ID, 0, 16)            \
+        FIELD(CXL_DEV_##n##_CAP_HDR0, CAP_VERSION, 16, 8)       \
+    REG32(CXL_DEV_##n##_CAP_HDR1, offset + 4)                   \
+        FIELD(CXL_DEV_##n##_CAP_HDR1, CAP_OFFSET, 0, 32)        \
+    REG32(CXL_DEV_##n##_CAP_HDR2, offset + 8)                   \
+        FIELD(CXL_DEV_##n##_CAP_HDR2, CAP_LENGTH, 0, 32)
+
+CXL_DEVICE_CAPABILITY_HEADER_REGISTER(DEVICE_STATUS, CXL_DEVICE_CAP_HDR1_OFFSET)
+CXL_DEVICE_CAPABILITY_HEADER_REGISTER(MAILBOX, CXL_DEVICE_CAP_HDR1_OFFSET + \
+                                               CXL_DEVICE_CAP_REG_SIZE)
+CXL_DEVICE_CAPABILITY_HEADER_REGISTER(MEMORY_DEVICE,
+                                      CXL_DEVICE_CAP_HDR1_OFFSET +
+                                      CXL_DEVICE_CAP_REG_SIZE * 2)
+
+int cxl_initialize_mailbox(CXLDeviceState *cxl_dstate);
+void cxl_process_mailbox(CXLDeviceState *cxl_dstate);
+
+#define cxl_device_cap_init(dstate, reg, cap_id)                           \
+    do {                                                                   \
+        uint32_t *cap_hdrs = dstate->caps_reg_state32;                     \
+        int which = R_CXL_DEV_##reg##_CAP_HDR0;                            \
+        cap_hdrs[which] =                                                  \
+            FIELD_DP32(cap_hdrs[which], CXL_DEV_##reg##_CAP_HDR0,          \
+                       CAP_ID, cap_id);                                    \
+        cap_hdrs[which] = FIELD_DP32(                                      \
+            cap_hdrs[which], CXL_DEV_##reg##_CAP_HDR0, CAP_VERSION, 1);    \
+        cap_hdrs[which + 1] =                                              \
+            FIELD_DP32(cap_hdrs[which + 1], CXL_DEV_##reg##_CAP_HDR1,      \
+                       CAP_OFFSET, CXL_##reg##_REGISTERS_OFFSET);          \
+        cap_hdrs[which + 2] =                                              \
+            FIELD_DP32(cap_hdrs[which + 2], CXL_DEV_##reg##_CAP_HDR2,      \
+                       CAP_LENGTH, CXL_##reg##_REGISTERS_LENGTH);          \
+    } while (0)
+
+/* CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register */
+REG32(CXL_DEV_MAILBOX_CAP, 0)
+    FIELD(CXL_DEV_MAILBOX_CAP, PAYLOAD_SIZE, 0, 5)
+    FIELD(CXL_DEV_MAILBOX_CAP, INT_CAP, 5, 1)
+    FIELD(CXL_DEV_MAILBOX_CAP, BG_INT_CAP, 6, 1)
+    FIELD(CXL_DEV_MAILBOX_CAP, MSI_N, 7, 4)
+
+/* CXL 2.0 8.2.8.4.4 Mailbox Control Register */
+REG32(CXL_DEV_MAILBOX_CTRL, 4)
+    FIELD(CXL_DEV_MAILBOX_CTRL, DOORBELL, 0, 1)
+    FIELD(CXL_DEV_MAILBOX_CTRL, INT_EN, 1, 1)
+    FIELD(CXL_DEV_MAILBOX_CTRL, BG_INT_EN, 2, 1)
+
+/* CXL 2.0 8.2.8.4.5 Command Register */
+REG64(CXL_DEV_MAILBOX_CMD, 8)
+    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND, 0, 8)
+    FIELD(CXL_DEV_MAILBOX_CMD, COMMAND_SET, 8, 8)
+    FIELD(CXL_DEV_MAILBOX_CMD, LENGTH, 16, 20)
+
+/* CXL 2.0 8.2.8.4.6 Mailbox Status Register */
+REG64(CXL_DEV_MAILBOX_STS, 0x10)
+    FIELD(CXL_DEV_MAILBOX_STS, BG_OP, 0, 1)
+    FIELD(CXL_DEV_MAILBOX_STS, ERRNO, 32, 16)
+    FIELD(CXL_DEV_MAILBOX_STS, VENDOR_ERRNO, 48, 16)
+
+/* CXL 2.0 8.2.8.4.7 Background Command Status Register */
+REG64(CXL_DEV_BG_CMD_STS, 0x18)
+    FIELD(CXL_DEV_BG_CMD_STS, OP, 0, 16)
+    FIELD(CXL_DEV_BG_CMD_STS, PERCENTAGE_COMP, 16, 7)
+    FIELD(CXL_DEV_BG_CMD_STS, RET_CODE, 32, 16)
+
FIELD(CXL_DEV_BG_CMD_STS, VENDOR_RET_CODE, 48, 16) + +/* CXL 2.0 8.2.8.4.8 Command Payload Registers */ +REG32(CXL_DEV_CMD_PAYLOAD, 0x20) + +REG64(CXL_MEM_DEV_STS, 0) + FIELD(CXL_MEM_DEV_STS, FATAL, 0, 1) + FIELD(CXL_MEM_DEV_STS, FW_HALT, 1, 1) + FIELD(CXL_MEM_DEV_STS, MEDIA_STATUS, 2, 2) + FIELD(CXL_MEM_DEV_STS, MBOX_READY, 4, 1) + FIELD(CXL_MEM_DEV_STS, RESET_NEEDED, 5, 3) + +struct CXLType3Dev { + /* Private */ + PCIDevice parent_obj; + + /* Properties */ + HostMemoryBackend *hostmem; + HostMemoryBackend *lsa; + + /* State */ + AddressSpace hostmem_as; + CXLComponentState cxl_cstate; + CXLDeviceState cxl_dstate; +}; + +#define TYPE_CXL_TYPE3 "cxl-type3" +OBJECT_DECLARE_TYPE(CXLType3Dev, CXLType3Class, CXL_TYPE3) + +struct CXLType3Class { + /* Private */ + PCIDeviceClass parent_class; + + /* public */ + uint64_t (*get_lsa_size)(CXLType3Dev *ct3d); + + uint64_t (*get_lsa)(CXLType3Dev *ct3d, void *buf, uint64_t size, + uint64_t offset); + void (*set_lsa)(CXLType3Dev *ct3d, const void *buf, uint64_t size, + uint64_t offset); +}; + +MemTxResult cxl_type3_read(PCIDevice *d, hwaddr host_addr, uint64_t *data, + unsigned size, MemTxAttrs attrs); +MemTxResult cxl_type3_write(PCIDevice *d, hwaddr host_addr, uint64_t data, + unsigned size, MemTxAttrs attrs); + +#endif diff --git a/include/hw/cxl/cxl_pci.h b/include/hw/cxl/cxl_pci.h new file mode 100644 index 0000000000..01cf002096 --- /dev/null +++ b/include/hw/cxl/cxl_pci.h @@ -0,0 +1,167 @@ +/* + * QEMU CXL PCI interfaces + * + * Copyright (c) 2020 Intel + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. + */ + +#ifndef CXL_PCI_H +#define CXL_PCI_H + +#include "qemu/compiler.h" +#include "hw/pci/pci.h" +#include "hw/pci/pcie.h" + +#define CXL_VENDOR_ID 0x1e98 + +#define PCIE_DVSEC_HEADER1_OFFSET 0x4 /* Offset from start of extend cap */ +#define PCIE_DVSEC_ID_OFFSET 0x8 + +#define PCIE_CXL_DEVICE_DVSEC_LENGTH 0x38 +#define PCIE_CXL1_DEVICE_DVSEC_REVID 0 +#define PCIE_CXL2_DEVICE_DVSEC_REVID 1 + +#define EXTENSIONS_PORT_DVSEC_LENGTH 0x28 +#define EXTENSIONS_PORT_DVSEC_REVID 0 + +#define GPF_PORT_DVSEC_LENGTH 0x10 +#define GPF_PORT_DVSEC_REVID 0 + +#define GPF_DEVICE_DVSEC_LENGTH 0x10 +#define GPF_DEVICE_DVSEC_REVID 0 + +#define PCIE_FLEXBUS_PORT_DVSEC_LENGTH_2_0 0x14 +#define PCIE_FLEXBUS_PORT_DVSEC_REVID_2_0 1 + +#define REG_LOC_DVSEC_LENGTH 0x24 +#define REG_LOC_DVSEC_REVID 0 + +enum { + PCIE_CXL_DEVICE_DVSEC = 0, + NON_CXL_FUNCTION_MAP_DVSEC = 2, + EXTENSIONS_PORT_DVSEC = 3, + GPF_PORT_DVSEC = 4, + GPF_DEVICE_DVSEC = 5, + PCIE_FLEXBUS_PORT_DVSEC = 7, + REG_LOC_DVSEC = 8, + MLD_DVSEC = 9, + CXL20_MAX_DVSEC +}; + +typedef struct DVSECHeader { + uint32_t cap_hdr; + uint32_t dv_hdr1; + uint16_t dv_hdr2; +} QEMU_PACKED DVSECHeader; +QEMU_BUILD_BUG_ON(sizeof(DVSECHeader) != 10); + +/* + * CXL 2.0 devices must implement certain DVSEC IDs, and can [optionally] + * implement others. 
+ * + * CXL 2.0 Device: 0, [2], 5, 8 + * CXL 2.0 RP: 3, 4, 7, 8 + * CXL 2.0 Upstream Port: [2], 7, 8 + * CXL 2.0 Downstream Port: 3, 4, 7, 8 + */ + +/* CXL 2.0 - 8.1.3 (ID 0001) */ +typedef struct CXLDVSECDevice { + DVSECHeader hdr; + uint16_t cap; + uint16_t ctrl; + uint16_t status; + uint16_t ctrl2; + uint16_t status2; + uint16_t lock; + uint16_t cap2; + uint32_t range1_size_hi; + uint32_t range1_size_lo; + uint32_t range1_base_hi; + uint32_t range1_base_lo; + uint32_t range2_size_hi; + uint32_t range2_size_lo; + uint32_t range2_base_hi; + uint32_t range2_base_lo; +} CXLDVSECDevice; +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDevice) != 0x38); + +/* CXL 2.0 - 8.1.5 (ID 0003) */ +typedef struct CXLDVSECPortExtensions { + DVSECHeader hdr; + uint16_t status; + uint16_t control; + uint8_t alt_bus_base; + uint8_t alt_bus_limit; + uint16_t alt_memory_base; + uint16_t alt_memory_limit; + uint16_t alt_prefetch_base; + uint16_t alt_prefetch_limit; + uint32_t alt_prefetch_base_high; + uint32_t alt_prefetch_limit_high; + uint32_t rcrb_base; + uint32_t rcrb_base_high; +} CXLDVSECPortExtensions; +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECPortExtensions) != 0x28); + +#define PORT_CONTROL_OFFSET 0xc +#define PORT_CONTROL_UNMASK_SBR 1 +#define PORT_CONTROL_ALT_MEMID_EN 4 + +/* CXL 2.0 - 8.1.6 GPF DVSEC (ID 0004) */ +typedef struct CXLDVSECPortGPF { + DVSECHeader hdr; + uint16_t rsvd; + uint16_t phase1_ctrl; + uint16_t phase2_ctrl; +} CXLDVSECPortGPF; +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECPortGPF) != 0x10); + +/* CXL 2.0 - 8.1.7 GPF DVSEC for CXL Device */ +typedef struct CXLDVSECDeviceGPF { + DVSECHeader hdr; + uint16_t phase2_duration; + uint32_t phase2_power; +} CXLDVSECDeviceGPF; +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECDeviceGPF) != 0x10); + +/* CXL 2.0 - 8.1.8/8.2.1.3 Flex Bus DVSEC (ID 0007) */ +typedef struct CXLDVSECPortFlexBus { + DVSECHeader hdr; + uint16_t cap; + uint16_t ctrl; + uint16_t status; + uint32_t rcvd_mod_ts_data_phase1; +} CXLDVSECPortFlexBus; +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECPortFlexBus) != 0x14); + +/* CXL 2.0 - 8.1.9 Register Locator DVSEC (ID 0008) */ +typedef struct CXLDVSECRegisterLocator { + DVSECHeader hdr; + uint16_t rsvd; + uint32_t reg0_base_lo; + uint32_t reg0_base_hi; + uint32_t reg1_base_lo; + uint32_t reg1_base_hi; + uint32_t reg2_base_lo; + uint32_t reg2_base_hi; +} CXLDVSECRegisterLocator; +QEMU_BUILD_BUG_ON(sizeof(CXLDVSECRegisterLocator) != 0x24); + +/* BAR Equivalence Indicator */ +#define BEI_BAR_10H 0 +#define BEI_BAR_14H 1 +#define BEI_BAR_18H 2 +#define BEI_BAR_1cH 3 +#define BEI_BAR_20H 4 +#define BEI_BAR_24H 5 + +/* Register Block Identifier */ +#define RBI_EMPTY 0 +#define RBI_COMPONENT_REG (1 << 8) +#define RBI_BAR_VIRT_ACL (2 << 8) +#define RBI_CXL_DEVICE_REG (3 << 8) + +#endif diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index bfa982a419..67653b0f9b 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -267,6 +267,7 @@ struct IntelIOMMUState { bool buggy_eim; /* Force buggy EIM unless eim=off */ uint8_t aw_bits; /* Host/IOVA address width (in bits) */ bool dma_drain; /* Whether DMA r/w draining enabled */ + bool dma_translation; /* Whether DMA translation supported */ /* * Protects IOMMU states in general. 
Currently it protects the
diff --git a/include/hw/i386/microvm.h b/include/hw/i386/microvm.h
index efcbd926fd..fad97a891d 100644
--- a/include/hw/i386/microvm.h
+++ b/include/hw/i386/microvm.h
@@ -67,8 +67,6 @@
 #define PCIE_ECAM_SIZE    0x10000000
 
 /* Machine type options */
-#define MICROVM_MACHINE_PIT "pit"
-#define MICROVM_MACHINE_PIC "pic"
 #define MICROVM_MACHINE_RTC "rtc"
 #define MICROVM_MACHINE_PCIE "pcie"
 #define MICROVM_MACHINE_IOAPIC2 "ioapic2"
@@ -86,8 +84,6 @@ struct MicrovmMachineState {
     X86MachineState parent;
 
     /* Machine type options */
-    OnOffAuto pic;
-    OnOffAuto pit;
     OnOffAuto rtc;
     OnOffAuto pcie;
     OnOffAuto ioapic2;
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index aff8add155..ffcac5121e 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -46,7 +46,6 @@ typedef struct PCMachineState {
     bool acpi_build_enabled;
     bool smbus_enabled;
     bool sata_enabled;
-    bool pit_enabled;
    bool hpet_enabled;
     bool i8042_enabled;
     bool default_bus_bypass_iommu;
@@ -64,7 +63,6 @@ typedef struct PCMachineState {
 #define PC_MACHINE_VMPORT           "vmport"
 #define PC_MACHINE_SMBUS            "smbus"
 #define PC_MACHINE_SATA             "sata"
-#define PC_MACHINE_PIT              "pit"
 #define PC_MACHINE_I8042            "i8042"
 #define PC_MACHINE_MAX_FW_SIZE      "max-fw-size"
 #define PC_MACHINE_SMBIOS_EP        "smbios-entry-point-type"
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 4841a49f86..9089bdd99c 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -65,6 +65,8 @@ struct X86MachineState {
 
     OnOffAuto smm;
     OnOffAuto acpi;
+    OnOffAuto pit;
+    OnOffAuto pic;
 
     char *oem_id;
     char *oem_table_id;
@@ -84,6 +86,8 @@ struct X86MachineState {
 
 #define X86_MACHINE_SMM              "smm"
 #define X86_MACHINE_ACPI             "acpi"
+#define X86_MACHINE_PIT              "pit"
+#define X86_MACHINE_PIC              "pic"
 #define X86_MACHINE_OEM_ID           "x-oem-id"
 #define X86_MACHINE_OEM_TABLE_ID     "x-oem-table-id"
 #define X86_MACHINE_BUS_LOCK_RATELIMIT  "bus-lock-ratelimit"
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 3a32b8dd40..44dacfa224 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -194,6 +194,8 @@ enum {
     QEMU_PCIE_LNKSTA_DLLLA = (1 << QEMU_PCIE_LNKSTA_DLLLA_BITNR),
 #define QEMU_PCIE_EXTCAP_INIT_BITNR 9
     QEMU_PCIE_EXTCAP_INIT = (1 << QEMU_PCIE_EXTCAP_INIT_BITNR),
+#define QEMU_PCIE_CXL_BITNR 10
+    QEMU_PCIE_CAP_CXL = (1 << QEMU_PCIE_CXL_BITNR),
 };
 
 #define TYPE_PCI_DEVICE "pci-device"
@@ -201,6 +203,12 @@ typedef struct PCIDeviceClass PCIDeviceClass;
 DECLARE_OBJ_CHECKERS(PCIDevice, PCIDeviceClass,
                      PCI_DEVICE, TYPE_PCI_DEVICE)
 
+/*
+ * Implemented by devices that can be plugged on CXL buses. In the spec,
+ * this is actually a "CXL Component", but we name it "device" to match
+ * the PCI naming.
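+ *
+ * A device advertises this in its TypeInfo interface list, e.g. (a
+ * sketch of the usual QOM pattern, not any specific device):
+ *
+ *   .interfaces = (InterfaceInfo[]) {
+ *       { INTERFACE_CXL_DEVICE },
+ *       { INTERFACE_PCIE_DEVICE },
+ *       {}
+ *   },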
+ */ +#define INTERFACE_CXL_DEVICE "cxl-device" + /* Implemented by devices that can be plugged on PCI Express buses */ #define INTERFACE_PCIE_DEVICE "pci-express-device" @@ -400,6 +408,7 @@ typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); #define TYPE_PCI_BUS "PCI" OBJECT_DECLARE_TYPE(PCIBus, PCIBusClass, PCI_BUS) #define TYPE_PCIE_BUS "PCIE" +#define TYPE_CXL_BUS "CXL" typedef void (*pci_bus_dev_fn)(PCIBus *b, PCIDevice *d, void *opaque); typedef void (*pci_bus_fn)(PCIBus *b, void *opaque); @@ -762,6 +771,11 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev) pci_irq_deassert(pci_dev); } +static inline int pci_is_cxl(const PCIDevice *d) +{ + return d->cap_present & QEMU_PCIE_CAP_CXL; +} + static inline int pci_is_express(const PCIDevice *d) { return d->cap_present & QEMU_PCI_CAP_EXPRESS; diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h index 30691a6e57..ba4bafac7c 100644 --- a/include/hw/pci/pci_bridge.h +++ b/include/hw/pci/pci_bridge.h @@ -28,6 +28,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pci_bus.h" +#include "hw/cxl/cxl.h" #include "qom/object.h" typedef struct PCIBridgeWindows PCIBridgeWindows; @@ -80,6 +81,25 @@ struct PCIBridge { #define PCI_BRIDGE_DEV_PROP_CHASSIS_NR "chassis_nr" #define PCI_BRIDGE_DEV_PROP_MSI "msi" #define PCI_BRIDGE_DEV_PROP_SHPC "shpc" +typedef struct CXLHost CXLHost; + +struct PXBDev { + /*< private >*/ + PCIDevice parent_obj; + /*< public >*/ + + uint8_t bus_nr; + uint16_t numa_node; + bool bypass_iommu; + struct cxl_dev { + CXLHost *cxl_host_bridge; /* Pointer to a CXLHost */ + } cxl; +}; + +typedef struct PXBDev PXBDev; +#define TYPE_PXB_CXL_DEVICE "pxb-cxl" +DECLARE_INSTANCE_CHECKER(PXBDev, PXB_CXL_DEV, + TYPE_PXB_CXL_DEVICE) int pci_bridge_ssvid_init(PCIDevice *dev, uint8_t offset, uint16_t svid, uint16_t ssid, diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h index 347440d42c..eb94e7e85c 100644 --- a/include/hw/pci/pci_bus.h +++ b/include/hw/pci/pci_bus.h @@ -24,6 +24,8 @@ enum PCIBusFlags { PCI_BUS_IS_ROOT = 0x0001, /* PCIe extended configuration space is accessible on this bus */ PCI_BUS_EXTENDED_CONFIG_SPACE = 0x0002, + /* This is a CXL Type BUS */ + PCI_BUS_CXL = 0x0004, }; struct PCIBus { @@ -53,6 +55,11 @@ struct PCIBus { Notifier machine_done; }; +static inline bool pci_bus_is_cxl(PCIBus *bus) +{ + return !!(bus->flags & PCI_BUS_CXL); +} + static inline bool pci_bus_is_root(PCIBus *bus) { return !!(bus->flags & PCI_BUS_IS_ROOT); diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h index 11abe22d46..898083b86f 100644 --- a/include/hw/pci/pci_ids.h +++ b/include/hw/pci/pci_ids.h @@ -53,6 +53,7 @@ #define PCI_BASE_CLASS_MEMORY 0x05 #define PCI_CLASS_MEMORY_RAM 0x0500 #define PCI_CLASS_MEMORY_FLASH 0x0501 +#define PCI_CLASS_MEMORY_CXL 0x0502 #define PCI_CLASS_MEMORY_OTHER 0x0580 #define PCI_BASE_CLASS_BRIDGE 0x06 diff --git a/include/hw/pci/pcie_host.h b/include/hw/pci/pcie_host.h index 076457b270..82d92177da 100644 --- a/include/hw/pci/pcie_host.h +++ b/include/hw/pci/pcie_host.h @@ -60,15 +60,15 @@ void pcie_host_mmcfg_update(PCIExpressHost *e, /* * PCI express ECAM (Enhanced Configuration Address Mapping) format. 
 * AKA mmcfg address
- * bit 20 - 28: bus number
+ * bit 20 - 27: bus number
 * bit 15 - 19: device number
 * bit 12 - 14: function number
 * bit  0 - 11: offset in configuration space of a given device
 */
-#define PCIE_MMCFG_SIZE_MAX        (1ULL << 29)
+#define PCIE_MMCFG_SIZE_MAX        (1ULL << 28)
 #define PCIE_MMCFG_SIZE_MIN        (1ULL << 20)
 #define PCIE_MMCFG_BUS_BIT         20
-#define PCIE_MMCFG_BUS_MASK        0x1ff
+#define PCIE_MMCFG_BUS_MASK        0xff
 #define PCIE_MMCFG_DEVFN_BIT       12
 #define PCIE_MMCFG_DEVFN_MASK      0xff
 #define PCIE_MMCFG_CONFOFFSET_MASK 0xfff
diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h
index e25b289ce8..7b8193061a 100644
--- a/include/hw/pci/pcie_port.h
+++ b/include/hw/pci/pcie_port.h
@@ -39,6 +39,8 @@ struct PCIEPort {
 
 void pcie_port_init_reg(PCIDevice *d);
 
+PCIDevice *pcie_find_port_by_pn(PCIBus *bus, uint8_t pn);
+
 #define TYPE_PCIE_SLOT "pcie-slot"
 OBJECT_DECLARE_SIMPLE_TYPE(PCIESlot, PCIE_SLOT)
 
diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
index e44a41bb70..c6e693cd3f 100644
--- a/include/hw/virtio/vhost-user.h
+++ b/include/hw/virtio/vhost-user.h
@@ -11,20 +11,61 @@
 #include "chardev/char-fe.h"
 #include "hw/virtio/virtio.h"
 
+/**
+ * VhostUserHostNotifier - notifier information for one queue
+ * @rcu: rcu_head for cleanup
+ * @mr: memory region of notifier
+ * @addr: current mapped address
+ * @unmap_addr: address to be un-mapped
+ * @idx: virtqueue index
+ *
+ * The VhostUserHostNotifier entries are re-used. When an old mapping
+ * is to be released it is moved to @unmap_addr and @addr is replaced.
+ * Once the RCU process has completed the unmap @unmap_addr is
+ * cleared.
+ */
 typedef struct VhostUserHostNotifier {
     struct rcu_head rcu;
     MemoryRegion mr;
     void *addr;
     void *unmap_addr;
+    int idx;
 } VhostUserHostNotifier;
 
+/**
+ * VhostUserState - shared state for all vhost-user devices
+ * @chr: the character backend for the socket
+ * @notifiers: GPtrArray of @VhostUserHostNotifier
+ * @memory_slots: number of memory slots supported by the backend
+ * @supports_config: whether the backend handles config space accesses
+ */
 typedef struct VhostUserState {
     CharBackend *chr;
-    VhostUserHostNotifier notifier[VIRTIO_QUEUE_MAX];
+    GPtrArray *notifiers;
     int memory_slots;
+    bool supports_config;
 } VhostUserState;
 
+/**
+ * vhost_user_init() - initialise shared vhost_user state
+ * @user: allocated area for storing shared state
+ * @chr: the chardev for the vhost socket
+ * @errp: error handle
+ *
+ * User can either directly g_new() space for the state or embed
+ * VhostUserState in their larger device structure and just point to
+ * it.
+ *
+ * Return: true on success, false on error while setting errp.
+ */
 bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp);
+
+/**
+ * vhost_user_cleanup() - cleanup state
+ * @user: ptr to user state
+ *
+ * Cleans up shared state and notifiers, the caller is responsible for
+ * freeing the @VhostUserState memory itself.
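+ *
+ * A minimal usage sketch (the embedded-state variant skips the
+ * allocation and free):
+ *
+ *   VhostUserState *user = g_new0(VhostUserState, 1);
+ *   if (!vhost_user_init(user, chr, errp)) {
+ *       g_free(user);
+ *       return;
+ *   }
+ *   ...
+ *   vhost_user_cleanup(user);
+ *   g_free(user);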
+ */
 void vhost_user_cleanup(VhostUserState *user);
 
 #endif
diff --git a/include/hw/virtio/vhost-vsock-common.h b/include/hw/virtio/vhost-vsock-common.h
index 456a9c2365..93c782101d 100644
--- a/include/hw/virtio/vhost-vsock-common.h
+++ b/include/hw/virtio/vhost-vsock-common.h
@@ -44,7 +44,7 @@ int vhost_vsock_common_start(VirtIODevice *vdev);
 void vhost_vsock_common_stop(VirtIODevice *vdev);
 int vhost_vsock_common_pre_save(void *opaque);
 int vhost_vsock_common_post_load(void *opaque, int version_id);
-void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name);
+void vhost_vsock_common_realize(VirtIODevice *vdev);
 void vhost_vsock_common_unrealize(VirtIODevice *vdev);
 uint64_t vhost_vsock_common_get_features(VirtIODevice *vdev, uint64_t features,
                                          Error **errp);
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 58a73e7b7a..b291fe4e24 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -61,6 +61,12 @@ typedef struct VhostDevConfigOps {
 } VhostDevConfigOps;
 
 struct vhost_memory;
+
+/**
+ * struct vhost_dev - common vhost_dev structure
+ * @vhost_ops: backend specific ops
+ * @config_ops: ops for config changes (see @vhost_dev_set_config_notifier)
+ */
 struct vhost_dev {
     VirtIODevice *vdev;
     MemoryListener memory_listener;
@@ -108,15 +114,129 @@ struct vhost_net {
     NetClientState *nc;
 };
 
+/**
+ * vhost_dev_init() - initialise the vhost interface
+ * @hdev: the common vhost_dev structure
+ * @opaque: opaque ptr passed to backend (vhost/vhost-user/vdpa)
+ * @backend_type: type of backend
+ * @busyloop_timeout: timeout for polling virtqueue
+ * @errp: error handle
+ *
+ * The initialisation of the vhost device will trigger the
+ * initialisation of the backend and potentially capability
+ * negotiation of the backend interface. Configuration of the VirtIO
+ * device itself won't happen until the interface is started.
+ *
+ * Return: 0 on success, non-zero on error while setting errp.
+ */
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                    VhostBackendType backend_type,
                    uint32_t busyloop_timeout, Error **errp);
+
+/**
+ * vhost_dev_cleanup() - tear down and cleanup vhost interface
+ * @hdev: the common vhost_dev structure
+ */
 void vhost_dev_cleanup(struct vhost_dev *hdev);
-int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
-void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+/**
+ * vhost_dev_enable_notifiers() - enable event notifiers
+ * @hdev: common vhost_dev structure
+ * @vdev: the VirtIODevice structure
+ *
+ * Enable notifications directly to the vhost device rather than being
+ * triggered by QEMU itself. Notifications should be enabled before
+ * the vhost device is started via @vhost_dev_start.
+ *
+ * Return: 0 on success, < 0 on error.
+ */
 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+/**
+ * vhost_dev_disable_notifiers() - disable event notifications
+ * @hdev: common vhost_dev structure
+ * @vdev: the VirtIODevice structure
+ *
+ * Disable direct notifications to vhost device.
+ */
 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
 
+/**
+ * vhost_dev_start() - start the vhost device
+ * @hdev: common vhost_dev structure
+ * @vdev: the VirtIODevice structure
+ *
+ * Starts the vhost device. From this point VirtIO feature negotiation
+ * can start and the device can start processing VirtIO transactions.
+ *
+ * Return: 0 on success, < 0 on error.
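+ *
+ * A typical call sequence is (a sketch, error handling elided):
+ *
+ *   vhost_dev_init(hdev, opaque, backend_type, 0, &err);
+ *   vhost_dev_enable_notifiers(hdev, vdev);
+ *   vhost_dev_start(hdev, vdev);
+ *   ...
+ *   vhost_dev_stop(hdev, vdev);
+ *   vhost_dev_disable_notifiers(hdev, vdev);
+ *   vhost_dev_cleanup(hdev);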
+ */
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+/**
+ * vhost_dev_stop() - stop the vhost device
+ * @hdev: common vhost_dev structure
+ * @vdev: the VirtIODevice structure
+ *
+ * Stop the vhost device. After the device is stopped the notifiers
+ * can be disabled (@vhost_dev_disable_notifiers) and the device can
+ * be torn down (@vhost_dev_cleanup).
+ */
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+/**
+ * DOC: vhost device configuration handling
+ *
+ * The VirtIO device configuration space is used for rarely changing
+ * or initialisation time parameters. The configuration can be updated
+ * by either the guest driver or the device itself. If the device can
+ * change the configuration over time the vhost handler should
+ * register a @VhostDevConfigOps structure with
+ * @vhost_dev_set_config_notifier so the guest can be notified. Some
+ * devices register a handler anyway and will signal an error if an
+ * unexpected config change happens.
+ */
+
+/**
+ * vhost_dev_get_config() - fetch device configuration
+ * @hdev: common vhost_dev structure
+ * @config: pointer to device appropriate config structure
+ * @config_len: size of device appropriate config structure
+ *
+ * Return: 0 on success, < 0 on error while setting errp
+ */
+int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
+                         uint32_t config_len, Error **errp);
+
+/**
+ * vhost_dev_set_config() - set device configuration
+ * @hdev: common vhost_dev structure
+ * @data: pointer to data to set
+ * @offset: offset into configuration space
+ * @size: length of the data to set
+ * @flags: @VhostSetConfigType flags
+ *
+ * By use of @offset/@size a subset of the configuration space can be
+ * written to. The @flags are used to indicate if it is a normal
+ * transaction or related to migration.
+ *
+ * Return: 0 on success, non-zero on error
+ */
+int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data,
+                         uint32_t offset, uint32_t size, uint32_t flags);
+
+/**
+ * vhost_dev_set_config_notifier() - register VhostDevConfigOps
+ * @hdev: common vhost_dev structure
+ * @ops: notifier ops
+ *
+ * If the device is expected to change configuration a notifier can be
+ * set up to handle the case.
+ */
+void vhost_dev_set_config_notifier(struct vhost_dev *dev,
+                                   const VhostDevConfigOps *ops);
+
+
 /* Test and clear masked event pending status.
  * Should be called after unmask to avoid losing events.
*/ @@ -136,14 +256,6 @@ int vhost_net_set_backend(struct vhost_dev *hdev, struct vhost_vring_file *file); int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write); -int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, - uint32_t config_len, Error **errp); -int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data, - uint32_t offset, uint32_t size, uint32_t flags); -/* notifier callback in case vhost device config space changed - */ -void vhost_dev_set_config_notifier(struct vhost_dev *dev, - const VhostDevConfigOps *ops); void vhost_dev_reset_inflight(struct vhost_inflight *inflight); void vhost_dev_free_inflight(struct vhost_inflight *inflight); diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h index 2179b75703..afff9e158e 100644 --- a/include/hw/virtio/virtio-gpu.h +++ b/include/hw/virtio/virtio-gpu.h @@ -22,6 +22,7 @@ #include "sysemu/vhost-user-backend.h" #include "standard-headers/linux/virtio_gpu.h" +#include "standard-headers/linux/virtio_ids.h" #include "qom/object.h" #define TYPE_VIRTIO_GPU_BASE "virtio-gpu-base" @@ -37,8 +38,6 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIOGPUGL, VIRTIO_GPU_GL) #define TYPE_VHOST_USER_GPU "vhost-user-gpu" OBJECT_DECLARE_SIMPLE_TYPE(VhostUserGPU, VHOST_USER_GPU) -#define VIRTIO_ID_GPU 16 - struct virtio_gpu_simple_resource { uint32_t resource_id; uint32_t width; diff --git a/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h similarity index 100% rename from hw/virtio/virtio-pci.h rename to include/hw/virtio/virtio-pci.h diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index b62a35fdca..db1c0ddf6b 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -22,6 +22,7 @@ #include "standard-headers/linux/virtio_config.h" #include "standard-headers/linux/virtio_ring.h" #include "qom/object.h" +#include "hw/virtio/vhost.h" /* A guest should never accept this. It implies negotiation is broken. */ #define VIRTIO_F_BAD_FEATURE 30 @@ -102,6 +103,7 @@ struct VirtIODevice bool started; bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */ bool disable_legacy_check; + bool vhost_started; VMChangeStateEntry *vmstate; char *bus_name; uint8_t device_endian; @@ -160,13 +162,14 @@ struct VirtioDeviceClass { int (*post_load)(VirtIODevice *vdev); const VMStateDescription *vmsd; bool (*primary_unplug_pending)(void *opaque); + struct vhost_dev *(*get_vhost)(VirtIODevice *vdev); }; void virtio_instance_init_common(Object *proxy_obj, void *data, size_t vdev_size, const char *vdev_name); -void virtio_init(VirtIODevice *vdev, const char *name, - uint16_t device_id, size_t config_size); +void virtio_init(VirtIODevice *vdev, uint16_t device_id, size_t config_size); + void virtio_cleanup(VirtIODevice *vdev); void virtio_error(VirtIODevice *vdev, const char *fmt, ...) G_GNUC_PRINTF(2, 3); diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 1e9fe47c03..df1e69ee72 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -306,7 +306,9 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, err: if (i) { - qemu_del_net_client(ncs[0]); + for (i--; i >= 0; i--) { + qemu_del_net_client(ncs[i]); + } } qemu_close(vdpa_device_fd); diff --git a/qapi/machine.json b/qapi/machine.json index 1e5bf02480..883ce3f9ea 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -502,6 +502,27 @@ 'dst': 'uint16', 'val': 'uint8' }} +## +# @CXLFixedMemoryWindowOptions: +# +# Create a CXL Fixed Memory Window +# +# @size: Size of the Fixed Memory Window in bytes. 
Must be a multiple
+#        of 256MiB.
+# @interleave-granularity: Number of contiguous bytes for which
+#                          accesses will go to a given interleave target.
+#                          Accepted values: [256, 512, 1k, 2k, 4k, 8k, 16k]
+# @targets: Target root bridge IDs from -device ...,id= for each root
+#           bridge.
+#
+# Since 7.1
+##
+{ 'struct': 'CXLFixedMemoryWindowOptions',
+  'data': {
+      'size': 'size',
+      '*interleave-granularity': 'size',
+      'targets': ['str'] }}
+
 ##
 # @X86CPURegister32:
 #
diff --git a/qemu-options.hx b/qemu-options.hx
index 60d2d728d6..b484640067 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -467,6 +467,44 @@ SRST
     -numa hmat-cache,node-id=1,size=10K,level=1,associativity=direct,policy=write-back,line=8
 ERST
 
+DEF("cxl-fixed-memory-window", HAS_ARG, QEMU_OPTION_cxl_fixed_memory_window,
+    "-cxl-fixed-memory-window targets.0=firsttarget,targets.1=secondtarget,size=size[,interleave-granularity=granularity]\n",
+    QEMU_ARCH_ALL)
+SRST
+``-cxl-fixed-memory-window targets.0=firsttarget,targets.1=secondtarget,size=size[,interleave-granularity=granularity]``
+    Define a CXL Fixed Memory Window (CFMW).
+
+    Described in the CXL 2.0 ECN: CEDT CFMWS & QTG _DSM.
+
+    CFMWs are regions of Host Physical Address (HPA) space on a system
+    which may be interleaved across one or more CXL host bridges. The
+    system software will assign particular devices into these windows
+    and configure the downstream Host-managed Device Memory (HDM)
+    decoders in root ports, switch ports and devices appropriately to
+    meet the interleave requirements before enabling the memory devices.
+
+    ``targets.X=firsttarget`` provides the mapping to CXL host bridges,
+    which may be identified by the id provided in the -device entry.
+    Multiple entries are needed to specify all the targets when
+    the fixed memory window represents interleaved memory. X is the
+    target index from 0.
+
+    ``size=size`` sets the size of the CFMW. This must be a multiple of
+    256MiB. The region will be aligned to 256MiB but the location is
+    platform and configuration dependent.
+
+    ``interleave-granularity=granularity`` sets the granularity of
+    interleave. Default 256KiB. Only 256KiB, 512KiB, 1024KiB, 2048KiB,
+    4096KiB, 8192KiB and 16384KiB granularities are supported.
+
+    Example:
+
+    ::
+
+        -cxl-fixed-memory-window targets.0=cxl.0,targets.1=cxl.1,size=128G,interleave-granularity=512k
+
+ERST
+
 DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd,
     "-add-fd fd=fd,set=set[,opaque=opaque]\n"
     "    Add 'fd' to fd 'set'\n", QEMU_ARCH_ALL)
diff --git a/scripts/device-crash-test b/scripts/device-crash-test
index 4bfc68c008..a203b3fdea 100755
--- a/scripts/device-crash-test
+++ b/scripts/device-crash-test
@@ -93,6 +93,7 @@ ERROR_RULE_LIST = [
     {'device':'pci-bridge', 'expected':True},      # Bridge chassis not specified. Each bridge is required to be assigned a unique chassis id > 0.
     {'device':'pci-bridge-seat', 'expected':True}, # Bridge chassis not specified. Each bridge is required to be assigned a unique chassis id > 0.
     {'device':'pxb', 'expected':True},             # Bridge chassis not specified. Each bridge is required to be assigned a unique chassis id > 0.
+    {'device':'pxb-cxl', 'expected':True},         # pxb-cxl devices cannot reside on a PCI bus.
{'device':'scsi-block', 'expected':True}, # drive property not set {'device':'scsi-generic', 'expected':True}, # drive property not set {'device':'scsi-hd', 'expected':True}, # drive property not set diff --git a/softmmu/vl.c b/softmmu/vl.c index 219b23a573..84a31eba76 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -93,6 +93,7 @@ #include "qemu/config-file.h" #include "qemu/qemu-options.h" #include "qemu/main-loop.h" +#include "hw/cxl/cxl.h" #ifdef CONFIG_VIRTFS #include "fsdev/qemu-fsdev.h" #endif @@ -120,6 +121,7 @@ #include "qapi/qapi-visit-audio.h" #include "qapi/qapi-visit-block-core.h" #include "qapi/qapi-visit-compat.h" +#include "qapi/qapi-visit-machine.h" #include "qapi/qapi-visit-ui.h" #include "qapi/qapi-commands-block-core.h" #include "qapi/qapi-commands-migration.h" @@ -145,6 +147,12 @@ typedef struct BlockdevOptionsQueueEntry { typedef QSIMPLEQ_HEAD(, BlockdevOptionsQueueEntry) BlockdevOptionsQueue; +typedef struct CXLFMWOptionQueueEntry { + CXLFixedMemoryWindowOptions *opts; + Location loc; + QSIMPLEQ_ENTRY(CXLFMWOptionQueueEntry) entry; +} CXLFMWOptionQueueEntry; + typedef struct ObjectOption { ObjectOptions *opts; QTAILQ_ENTRY(ObjectOption) next; @@ -171,6 +179,8 @@ static int snapshot; static bool preconfig_requested; static QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list); static BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue); +static QSIMPLEQ_HEAD(, CXLFMWOptionQueueEntry) CXLFMW_opts = + QSIMPLEQ_HEAD_INITIALIZER(CXLFMW_opts); static bool nographic = false; static int mem_prealloc; /* force preallocation of physical target memory */ static const char *vga_model = NULL; @@ -1155,6 +1165,24 @@ static void parse_display(const char *p) } } +static void parse_cxl_fixed_memory_window(const char *optarg) +{ + CXLFMWOptionQueueEntry *cfmws_entry; + Visitor *v; + + v = qobject_input_visitor_new_str(optarg, "cxl-fixed-memory-window", + &error_fatal); + cfmws_entry = g_new(CXLFMWOptionQueueEntry, 1); + visit_type_CXLFixedMemoryWindowOptions(v, NULL, &cfmws_entry->opts, + &error_fatal); + if (!cfmws_entry->opts) { + exit(1); + } + visit_free(v); + loc_save(&cfmws_entry->loc); + QSIMPLEQ_INSERT_TAIL(&CXLFMW_opts, cfmws_entry, entry); +} + static inline bool nonempty_str(const char *str) { return str && *str; @@ -2017,6 +2045,20 @@ static void qemu_create_late_backends(void) qemu_semihosting_console_init(); } +static void cxl_set_opts(void) +{ + while (!QSIMPLEQ_EMPTY(&CXLFMW_opts)) { + CXLFMWOptionQueueEntry *cfmws_entry = QSIMPLEQ_FIRST(&CXLFMW_opts); + + loc_restore(&cfmws_entry->loc); + QSIMPLEQ_REMOVE_HEAD(&CXLFMW_opts, entry); + cxl_fixed_memory_window_config(current_machine, cfmws_entry->opts, + &error_fatal); + qapi_free_CXLFixedMemoryWindowOptions(cfmws_entry->opts); + g_free(cfmws_entry); + } +} + static void qemu_resolve_machine_memdev(void) { if (ram_memdev_id) { @@ -2663,6 +2705,7 @@ void qmp_x_exit_preconfig(Error **errp) qemu_init_board(); qemu_create_cli_devices(); + cxl_fixed_memory_window_link_targets(errp); qemu_machine_creation_done(); if (loadvm) { @@ -2843,6 +2886,9 @@ void qemu_init(int argc, char **argv, char **envp) exit(1); } break; + case QEMU_OPTION_cxl_fixed_memory_window: + parse_cxl_fixed_memory_window(optarg); + break; case QEMU_OPTION_display: parse_display(optarg); break; @@ -3678,6 +3724,7 @@ void qemu_init(int argc, char **argv, char **envp) qemu_resolve_machine_memdev(); parse_numa_opts(current_machine); + cxl_set_opts(); if (vmstate_dump_file) { /* dump and exit */ diff --git 
a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index d0041c864b..b4cc3c2d68 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -99,7 +99,7 @@ static inline bool vu_has_protocol_feature(VuDev *dev, unsigned int fbit) return has_feature(dev->protocol_features, fbit); } -static const char * +const char * vu_request_to_string(unsigned int req) { #define REQ(req) [req] = #req diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index cde9f07bb3..aea7ec5061 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -473,6 +473,15 @@ bool vu_init(VuDev *dev, */ void vu_deinit(VuDev *dev); + +/** + * vu_request_to_string: return string for vhost message request + * @req: VhostUserMsg request + * + * Returns a const string, do not free. + */ +const char *vu_request_to_string(unsigned int req); + /** * vu_dispatch: * @dev: a VuDev context diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c index 5eb955ce9a..7237378a7d 100644 --- a/target/i386/kvm/kvm-cpu.c +++ b/target/i386/kvm/kvm-cpu.c @@ -171,7 +171,7 @@ static void kvm_cpu_instance_init(CPUState *cs) /* only applies to builtin_x86_defs cpus */ if (!kvm_irqchip_in_kernel()) { x86_cpu_change_kvm_default("x2apic", "off"); - } else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) { + } else if (kvm_irqchip_is_split()) { x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on"); } diff --git a/tests/data/acpi/q35/CEDT.cxl b/tests/data/acpi/q35/CEDT.cxl new file mode 100644 index 0000000000..b8fa06b00e Binary files /dev/null and b/tests/data/acpi/q35/CEDT.cxl differ diff --git a/tests/data/acpi/q35/DSDT.cxl b/tests/data/acpi/q35/DSDT.cxl new file mode 100644 index 0000000000..c1206defed Binary files /dev/null and b/tests/data/acpi/q35/DSDT.cxl differ diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index 5dddedabcd..a4a46e97f0 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -1536,6 +1536,49 @@ static void test_acpi_q35_viot(void) free_test_data(&data); } +static void test_acpi_q35_cxl(void) +{ + gchar *tmp_path = g_dir_make_tmp("qemu-test-cxl.XXXXXX", NULL); + gchar *params; + + test_data data = { + .machine = MACHINE_Q35, + .variant = ".cxl", + }; + /* + * A complex CXL setup. 
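+ * Two host bridges (pxb-cxl), each with two root ports and a type 3 + * device on every port, plus two fixed memory windows: one targeting + * a single host bridge and the other interleaved across both at 8k + * granularity.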
+ */ + params = g_strdup_printf(" -machine cxl=on" + " -object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M" + " -object memory-backend-file,id=cxl-mem2,mem-path=%s,size=256M" + " -object memory-backend-file,id=cxl-mem3,mem-path=%s,size=256M" + " -object memory-backend-file,id=cxl-mem4,mem-path=%s,size=256M" + " -object memory-backend-file,id=lsa1,mem-path=%s,size=256M" + " -object memory-backend-file,id=lsa2,mem-path=%s,size=256M" + " -object memory-backend-file,id=lsa3,mem-path=%s,size=256M" + " -object memory-backend-file,id=lsa4,mem-path=%s,size=256M" + " -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1" + " -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2" + " -device cxl-rp,port=0,bus=cxl.1,id=rp1,chassis=0,slot=2" + " -device cxl-type3,bus=rp1,memdev=cxl-mem1,lsa=lsa1" + " -device cxl-rp,port=1,bus=cxl.1,id=rp2,chassis=0,slot=3" + " -device cxl-type3,bus=rp2,memdev=cxl-mem2,lsa=lsa2" + " -device cxl-rp,port=0,bus=cxl.2,id=rp3,chassis=0,slot=5" + " -device cxl-type3,bus=rp3,memdev=cxl-mem3,lsa=lsa3" + " -device cxl-rp,port=1,bus=cxl.2,id=rp4,chassis=0,slot=6" + " -device cxl-type3,bus=rp4,memdev=cxl-mem4,lsa=lsa4" + " -cxl-fixed-memory-window targets.0=cxl.1,size=4G,interleave-granularity=8k" + " -cxl-fixed-memory-window targets.0=cxl.1,targets.1=cxl.2,size=4G,interleave-granularity=8k", + tmp_path, tmp_path, tmp_path, tmp_path, + tmp_path, tmp_path, tmp_path, tmp_path); + test_acpi_one(params, &data); + + g_free(params); + g_assert(g_rmdir(tmp_path) == 0); + g_free(tmp_path); + free_test_data(&data); +} + static void test_acpi_virt_viot(void) { test_data data = { @@ -1741,6 +1784,7 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/kvm/dmar", test_acpi_q35_kvm_dmar); } qtest_add_func("acpi/q35/viot", test_acpi_q35_viot); + qtest_add_func("acpi/q35/cxl", test_acpi_q35_cxl); qtest_add_func("acpi/q35/slic", test_acpi_q35_slic); } else if (strcmp(arch, "aarch64") == 0) { if (has_tcg) { diff --git a/tests/qtest/cxl-test.c b/tests/qtest/cxl-test.c new file mode 100644 index 0000000000..079011af6a --- /dev/null +++ b/tests/qtest/cxl-test.c @@ -0,0 +1,151 @@ +/* + * QTest testcase for CXL + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
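+ * + * Each test below assembles its command line from the QEMU_* macro + * fragments; e.g. QEMU_PXB_CMD QEMU_RP QEMU_T3D gives a q35 machine + * with cxl=on, one host bridge (pxb-cxl) behind a 4G fixed memory + * window, a single root port (cxl-rp) and one type 3 device backed + * by two memory-backend-file objects, one for the device memory and + * one for the Label Storage Area (LSA). The %s placeholders are + * filled with a temporary directory at runtime.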
+ */ + +#include "qemu/osdep.h" +#include "libqtest-single.h" + +#define QEMU_PXB_CMD "-machine q35,cxl=on " \ + "-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 " \ + "-cxl-fixed-memory-window targets.0=cxl.0,size=4G " + +#define QEMU_2PXB_CMD "-machine q35,cxl=on " \ + "-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 " \ + "-device pxb-cxl,id=cxl.1,bus=pcie.0,bus_nr=53 " \ + "-cxl-fixed-memory-window targets.0=cxl.0,targets.1=cxl.1,size=4G " + +#define QEMU_RP "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " + +/* Dual ports on first pxb */ +#define QEMU_2RP "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " \ + "-device cxl-rp,id=rp1,bus=cxl.0,chassis=0,slot=1 " + +/* Dual ports on each of the pxb instances */ +#define QEMU_4RP "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " \ + "-device cxl-rp,id=rp1,bus=cxl.0,chassis=0,slot=1 " \ + "-device cxl-rp,id=rp2,bus=cxl.1,chassis=0,slot=2 " \ + "-device cxl-rp,id=rp3,bus=cxl.1,chassis=0,slot=3 " + +#define QEMU_T3D "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 " + +#define QEMU_2T3D "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 " \ + "-object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa1,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp1,memdev=cxl-mem1,lsa=lsa1,id=cxl-pmem1 " + +#define QEMU_4T3D "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 " \ + "-object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa1,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp1,memdev=cxl-mem1,lsa=lsa1,id=cxl-pmem1 " \ + "-object memory-backend-file,id=cxl-mem2,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa2,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp2,memdev=cxl-mem2,lsa=lsa2,id=cxl-pmem2 " \ + "-object memory-backend-file,id=cxl-mem3,mem-path=%s,size=256M " \ + "-object memory-backend-file,id=lsa3,mem-path=%s,size=256M " \ + "-device cxl-type3,bus=rp3,memdev=cxl-mem3,lsa=lsa3,id=cxl-pmem3 " + +static void cxl_basic_hb(void) +{ + qtest_start("-machine q35,cxl=on"); + qtest_end(); +} + +static void cxl_basic_pxb(void) +{ + qtest_start("-machine q35,cxl=on -device pxb-cxl,bus=pcie.0"); + qtest_end(); +} + +static void cxl_pxb_with_window(void) +{ + qtest_start(QEMU_PXB_CMD); + qtest_end(); +} + +static void cxl_2pxb_with_window(void) +{ + qtest_start(QEMU_2PXB_CMD); + qtest_end(); +} + +static void cxl_root_port(void) +{ + qtest_start(QEMU_PXB_CMD QEMU_RP); + qtest_end(); +} + +static void cxl_2root_port(void) +{ + qtest_start(QEMU_PXB_CMD QEMU_2RP); + qtest_end(); +} + +static void cxl_t3d(void) +{ + g_autoptr(GString) cmdline = g_string_new(NULL); + char template[] = "/tmp/cxl-test-XXXXXX"; + const char *tmpfs; + + tmpfs = mkdtemp(template); + + g_string_printf(cmdline, QEMU_PXB_CMD QEMU_RP QEMU_T3D, tmpfs, tmpfs); + + qtest_start(cmdline->str); + qtest_end(); +} + +static void cxl_1pxb_2rp_2t3d(void) +{ + g_autoptr(GString) cmdline = g_string_new(NULL); + char template[] = "/tmp/cxl-test-XXXXXX"; + const char *tmpfs; + + tmpfs = mkdtemp(template); + + g_string_printf(cmdline, 
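/* QEMU_2T3D consumes four %s: mem-path for cxl-mem0, lsa0, cxl-mem1 and lsa1 */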
QEMU_PXB_CMD QEMU_2RP QEMU_2T3D, + tmpfs, tmpfs, tmpfs, tmpfs); + + qtest_start(cmdline->str); + qtest_end(); +} + +static void cxl_2pxb_4rp_4t3d(void) +{ + g_autoptr(GString) cmdline = g_string_new(NULL); + char template[] = "/tmp/cxl-test-XXXXXX"; + const char *tmpfs; + + tmpfs = mkdtemp(template); + + g_string_printf(cmdline, QEMU_2PXB_CMD QEMU_4RP QEMU_4T3D, + tmpfs, tmpfs, tmpfs, tmpfs, tmpfs, tmpfs, + tmpfs, tmpfs); + + qtest_start(cmdline->str); + qtest_end(); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + + qtest_add_func("/pci/cxl/basic_hostbridge", cxl_basic_hb); + qtest_add_func("/pci/cxl/basic_pxb", cxl_basic_pxb); + qtest_add_func("/pci/cxl/pxb_with_window", cxl_pxb_with_window); + qtest_add_func("/pci/cxl/pxb_x2_with_window", cxl_2pxb_with_window); + qtest_add_func("/pci/cxl/rp", cxl_root_port); + qtest_add_func("/pci/cxl/rp_x2", cxl_2root_port); + qtest_add_func("/pci/cxl/type3_device", cxl_t3d); + qtest_add_func("/pci/cxl/rp_x2_type3_x2", cxl_1pxb_2rp_2t3d); + qtest_add_func("/pci/cxl/pxb_x2_root_port_x4_type3_x4", cxl_2pxb_4rp_4t3d); + return g_test_run(); +} diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index b425484920..31287a9173 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -35,6 +35,9 @@ qtests_pci = \ (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : []) + \ (config_all_devices.has_key('CONFIG_IVSHMEM_DEVICE') ? ['ivshmem-test'] : []) +qtests_cxl = \ + (config_all_devices.has_key('CONFIG_CXL') ? ['cxl-test'] : []) + qtests_i386 = \ (slirp.found() ? ['pxe-test', 'test-netfilter'] : []) + \ (config_host.has_key('CONFIG_POSIX') ? ['test-filter-mirror'] : []) + \ @@ -74,6 +77,7 @@ qtests_i386 = \ slirp.found() ? ['virtio-net-failover'] : []) + \ (unpack_edk2_blobs ? ['bios-tables-test'] : []) + \ qtests_pci + \ + qtests_cxl + \ ['fdc-test', 'ide-test', 'hd-geo-test',