-----BEGIN PGP SIGNATURE-----

iQFEBAABCAAuFiEEUAN8t5cGD3bwIa1WyjViTGqRccYFAlp7p/IQHGZhbXpAcmVk
 aGF0LmNvbQAKCRDKNWJMapFxxsc7B/458EEIOOS5Md4X7d51wsAUYwaRXtnoRUeD
 sO73+kslnr/egEY9wo/BCyfNkOSwVFj0tdgbz97FzJIAQ01TJG96BiwW/NqccTG2
 PwEXjfv3RALOw+hNmW6vaHDyXiwgiseDwoz0RwgDjlWae55ebUmizex5ChuTIgxy
 ePEANQbOsfjET37S3hYERmJltDyeM+cwYbegrv/Aky7lMatpQeJQlOEYPdRalvti
 PEK4WnexQIVCguLuuqy2LcbIsyjvF8xSh2m2kUCiTFARWwciEI8q1rxLKO3V6Kth
 Q2rcF2QBVqV2CJp3GdhOv+oG4CUl/T0Anf6HQTfF2hmVru/8cDjw
 =Qmvf
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/famz/tags/staging-pull-request' into staging

# gpg: Signature made Thu 08 Feb 2018 01:29:22 GMT
# gpg:                using RSA key CA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021  AD56 CA35 624C 6A91 71C6

* remotes/famz/tags/staging-pull-request:
  docs: Add docs/devel/testing.rst
  qapi: Add NVMe driver options to the schema
  docs: Add section for NVMe VFIO driver
  block: Move NVMe constants to a separate header
  qemu-img: Map bench buffer
  block/nvme: Implement .bdrv_(un)register_buf
  block: Introduce buf register API
  block: Add VFIO based NVMe driver
  util: Introduce vfio helpers
  stubs: Add stubs for ram block API
  curl: convert to CoQueue
  coroutine-lock: make qemu_co_enter_next thread-safe
  coroutine-lock: convert CoQueue to use QemuLockable
  lockable: add QemuLockable
  test-coroutine: add simple CoMutex test
  docker: change Fedora base image to fedora:27

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2018-02-08 14:31:51 +00:00
commit 008a51bbb3
33 changed files with 3580 additions and 829 deletions

View File

@ -1888,6 +1888,12 @@ L: qemu-block@nongnu.org
S: Supported
F: block/null.c
NVMe Block Driver
M: Fam Zheng <famz@redhat.com>
L: qemu-block@nongnu.org
S: Supported
F: block/nvme*
Bootdevice
M: Gonglei <arei.gonglei@huawei.com>
S: Maintained

View File

@ -11,6 +11,7 @@ block-obj-$(CONFIG_POSIX) += file-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
block-obj-y += null.o mirror.o commit.o io.o
block-obj-y += throttle-groups.o
block-obj-$(CONFIG_LINUX) += nvme.o
block-obj-y += nbd.o nbd-client.o sheepdog.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o

View File

@ -2096,3 +2096,13 @@ static void blk_root_drained_end(BdrvChild *child)
}
}
}
void blk_register_buf(BlockBackend *blk, void *host, size_t size)
{
bdrv_register_buf(blk_bs(blk), host, size);
}
void blk_unregister_buf(BlockBackend *blk, void *host)
{
bdrv_unregister_buf(blk_bs(blk), host);
}

View File

@ -101,8 +101,6 @@ typedef struct CURLAIOCB {
size_t start;
size_t end;
QSIMPLEQ_ENTRY(CURLAIOCB) next;
} CURLAIOCB;
typedef struct CURLSocket {
@ -138,7 +136,7 @@ typedef struct BDRVCURLState {
bool accept_range;
AioContext *aio_context;
QemuMutex mutex;
QSIMPLEQ_HEAD(, CURLAIOCB) free_state_waitq;
CoQueue free_state_waitq;
char *username;
char *password;
char *proxyusername;
@ -538,7 +536,6 @@ static int curl_init_state(BDRVCURLState *s, CURLState *state)
/* Called with s->mutex held. */
static void curl_clean_state(CURLState *s)
{
CURLAIOCB *next;
int j;
for (j = 0; j < CURL_NUM_ACB; j++) {
assert(!s->acb[j]);
@ -556,13 +553,7 @@ static void curl_clean_state(CURLState *s)
s->in_use = 0;
next = QSIMPLEQ_FIRST(&s->s->free_state_waitq);
if (next) {
QSIMPLEQ_REMOVE_HEAD(&s->s->free_state_waitq, next);
qemu_mutex_unlock(&s->s->mutex);
aio_co_wake(next->co);
qemu_mutex_lock(&s->s->mutex);
}
qemu_co_enter_next(&s->s->free_state_waitq, &s->s->mutex);
}
static void curl_parse_filename(const char *filename, QDict *options,
@ -784,7 +775,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
}
DPRINTF("CURL: Opening %s\n", file);
QSIMPLEQ_INIT(&s->free_state_waitq);
qemu_co_queue_init(&s->free_state_waitq);
s->aio_context = bdrv_get_aio_context(bs);
s->url = g_strdup(file);
qemu_mutex_lock(&s->mutex);
@ -888,10 +879,7 @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
if (state) {
break;
}
QSIMPLEQ_INSERT_TAIL(&s->free_state_waitq, acb, next);
qemu_mutex_unlock(&s->mutex);
qemu_coroutine_yield();
qemu_mutex_lock(&s->mutex);
qemu_co_queue_wait(&s->free_state_waitq, &s->mutex);
}
if (curl_init_state(s, state) < 0) {

View File

@ -2825,3 +2825,27 @@ void bdrv_io_unplug(BlockDriverState *bs)
bdrv_io_unplug(child->bs);
}
}
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
BdrvChild *child;
if (bs->drv && bs->drv->bdrv_register_buf) {
bs->drv->bdrv_register_buf(bs, host, size);
}
QLIST_FOREACH(child, &bs->children, next) {
bdrv_register_buf(child->bs, host, size);
}
}
void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
BdrvChild *child;
if (bs->drv && bs->drv->bdrv_unregister_buf) {
bs->drv->bdrv_unregister_buf(bs, host);
}
QLIST_FOREACH(child, &bs->children, next) {
bdrv_unregister_buf(child->bs, host);
}
}

1201
block/nvme.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -124,3 +124,24 @@ vxhs_open_iio_open(const char *host) "Failed to connect to storage agent on host
vxhs_parse_uri_hostinfo(char *host, int port) "Host: IP %s, Port %d"
vxhs_close(char *vdisk_guid) "Closing vdisk %s"
vxhs_get_creds(const char *cacert, const char *client_key, const char *client_cert) "cacert %s, client_key %s, client_cert %s"
# block/nvme.c
nvme_kick(void *s, int queue) "s %p queue %d"
nvme_dma_flush_queue_wait(void *s) "s %p"
nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d"
nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
nvme_handle_event(void *s) "s %p"
nvme_poll_cb(void *s) "s %p"
nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
nvme_dma_map_flush(void *s) "s %p"
nvme_free_req_queue_wait(void *q) "q %p"
nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"

486
docs/devel/testing.rst Normal file
View File

@ -0,0 +1,486 @@
===============
Testing in QEMU
===============
This document describes the testing infrastructure in QEMU.
Testing with "make check"
=========================
The "make check" testing family includes most of the C based tests in QEMU. For
a quick help, run ``make check-help`` from the source tree.
The usual way to run these tests is:
.. code::
make check
which includes QAPI schema tests, unit tests, and QTests. Different sub-types
of "make check" tests will be explained below.
Before running tests, it is best to build QEMU programs first. Some tests
expect the executables to exist and will fail with obscure messages if they
cannot find them.
Unit tests
----------
Unit tests, which can be invoked with ``make check-unit``, are simple C tests
that typically link to individual QEMU object files and exercise them by
calling exported functions.
If you are writing new code in QEMU, consider adding a unit test, especially
for utility modules that are relatively stateless or have few dependencies. To
add a new unit test:
1. Create a new source file. For example, ``tests/foo-test.c``.
2. Write the test. Normally you would include the header file which exports
the module API, then verify the interface behaves as expected from your
test. The test code should be organized with the glib testing framework.
Copying and modifying an existing test is usually a good idea.
3. Add the test to ``tests/Makefile.include``. First, name the unit test
program and add it to ``$(check-unit-y)``; then add a rule to build the
executable. Optionally, you can add a magical variable to support ``gcov``.
For example:
.. code::
check-unit-y += tests/foo-test$(EXESUF)
tests/foo-test$(EXESUF): tests/foo-test.o $(test-util-obj-y)
...
gcov-files-foo-test-y = util/foo.c
Since unit tests don't require environment variables, the simplest way to debug
a unit test failure is often directly invoking it or even running it under
``gdb``. However there can still be differences in behavior between ``make``
invocations and your manual run, due to ``$MALLOC_PERTURB_`` environment
variable (which affects memory reclamation and catches invalid pointers better)
and gtester options. If necessary, you can run
.. code::
make check-unit V=1
and copy the actual command line which executes the unit test, then run
it from the command line.
QTest
-----
QTest is a device emulation testing framework. It can be very useful to test
device models; it could also control certain aspects of QEMU (such as virtual
clock stepping), with a special purpose "qtest" protocol. Refer to the
documentation in ``qtest.c`` for more details of the protocol.
QTest cases can be executed with
.. code::
make check-qtest
The QTest library is implemented by ``tests/libqtest.c`` and the API is defined
in ``tests/libqtest.h``.
Consider adding a new QTest case when you are introducing a new virtual
hardware, or extending one if you are adding functionalities to an existing
virtual device.
On top of libqtest, a higher level library, ``libqos``, was created to
encapsulate common tasks of device drivers, such as memory management and
communicating with system buses or devices. Many virtual device tests use
libqos instead of directly calling into libqtest.
Steps to add a new QTest case are:
1. Create a new source file for the test. (More than one file can be added as
necessary.) For example, ``tests/test-foo-device.c``.
2. Write the test code with the glib and libqtest/libqos API. See also existing
tests and the library headers for reference.
3. Register the new test in ``tests/Makefile.include``. Add the test executable
name to an appropriate ``check-qtest-*-y`` variable. For example:
``check-qtest-generic-y = tests/test-foo-device$(EXESUF)``
4. Add object dependencies of the executable in the Makefile, including the
test source file(s) and other interesting objects. For example:
``tests/test-foo-device$(EXESUF): tests/test-foo-device.o $(libqos-obj-y)``
Debugging a QTest failure is slightly harder than the unit test because the
tests look up QEMU program names in the environment variables, such as
``QTEST_QEMU_BINARY`` and ``QTEST_QEMU_IMG``, and also because it is not easy
to attach gdb to the QEMU process spawned from the test. But manual invoking
and using gdb on the test is still simple to do: find out the actual command
from the output of
.. code::
make check-qtest V=1
which you can run manually.
QAPI schema tests
-----------------
The QAPI schema tests validate the QAPI parser used by QMP, by feeding
predefined input to the parser and comparing the result with the reference
output.
The input/output data is managed under the ``tests/qapi-schema`` directory.
Each test case includes four files that have a common base name:
* ``${casename}.json`` - the file contains the JSON input for feeding the
parser
* ``${casename}.out`` - the file contains the expected stdout from the parser
* ``${casename}.err`` - the file contains the expected stderr from the parser
* ``${casename}.exit`` - the expected error code
Consider adding a new QAPI schema test when you are making a change on the QAPI
parser (either fixing a bug or extending/modifying the syntax). To do this:
1. Add four files for the new case as explained above. For example:
``$EDITOR tests/qapi-schema/foo.{json,out,err,exit}``.
2. Add the new test in ``tests/Makefile.include``. For example:
``qapi-schema += foo.json``
check-block
-----------
``make check-block`` is a legacy command to invoke block layer iotests and is
rarely used. See "QEMU iotests" section below for more information.
GCC gcov support
----------------
``gcov`` is a GCC tool to analyze the testing coverage by instrumenting the
tested code. To use it, configure QEMU with ``--enable-gcov`` option and build.
Then run ``make check`` as usual. There will be additional ``gcov`` output as
the testing goes on, showing the test coverage percentage numbers per analyzed
source file. More detailed reports can be obtained by running ``gcov`` command
on the output files under ``$build_dir/tests/``, please read the ``gcov``
documentation for more information.
QEMU iotests
============
QEMU iotests, under the directory ``tests/qemu-iotests``, is the testing
framework widely used to test block layer related features. It is higher level
than "make check" tests and 99% of the code is written in bash or Python
scripts. The testing success criteria is golden output comparison, and the
test files are named with numbers.
To run iotests, make sure QEMU is built successfully, then switch to the
``tests/qemu-iotests`` directory under the build directory, and run ``./check``
with desired arguments from there.
By default, "raw" format and "file" protocol is used; all tests will be
executed, except the unsupported ones. You can override the format and protocol
with arguments:
.. code::
# test with qcow2 format
./check -qcow2
# or test a different protocol
./check -nbd
It's also possible to list test numbers explicitly:
.. code::
# run selected cases with qcow2 format
./check -qcow2 001 030 153
Cache mode can be selected with the "-c" option, which may help reveal bugs
that are specific to certain cache mode.
More options are supported by the ``./check`` script, run ``./check -h`` for
help.
Writing a new test case
-----------------------
Consider writing a tests case when you are making any changes to the block
layer. An iotest case is usually the choice for that. There are already many
test cases, so it is possible that extending one of them may achieve the goal
and save the boilerplate to create one. (Unfortunately, there isn't a 100%
reliable way to find a related one out of hundreds of tests. One approach is
using ``git grep``.)
Usually an iotest case consists of two files. One is an executable that
produces output to stdout and stderr, the other is the expected reference
output. They are given the same number in file names. E.g. Test script ``055``
and reference output ``055.out``.
In rare cases, when outputs differ between cache mode ``none`` and others, a
``.out.nocache`` file is added. In other cases, when outputs differ between
image formats, more than one ``.out`` files are created ending with the
respective format names, e.g. ``178.out.qcow2`` and ``178.out.raw``.
There isn't a hard rule about how to write a test script, but a new test is
usually a (copy and) modification of an existing case. There are a few
commonly used ways to create a test:
* A Bash script. It will make use of several environmental variables related
to the testing procedure, and could source a group of ``common.*`` libraries
for some common helper routines.
* A Python unittest script. Import ``iotests`` and create a subclass of
``iotests.QMPTestCase``, then call ``iotests.main`` method. The downside of
this approach is that the output is too scarce, and the script is considered
harder to debug.
* A simple Python script without using unittest module. This could also import
``iotests`` for launching QEMU and utilities etc, but it doesn't inherit
from ``iotests.QMPTestCase`` therefore doesn't use the Python unittest
execution. This is a combination of 1 and 2.
Pick the language per your preference since both Bash and Python have
comparable library support for invoking and interacting with QEMU programs. If
you opt for Python, it is strongly recommended to write Python 3 compatible
code.
Docker based tests
==================
Introduction
------------
The Docker testing framework in QEMU utilizes public Docker images to build and
test QEMU in predefined and widely accessible Linux environments. This makes
it possible to expand the test coverage across distros, toolchain flavors and
library versions.
Prerequisites
-------------
Install "docker" with the system package manager and start the Docker service
on your development machine, then make sure you have the privilege to run
Docker commands. Typically it means setting up passwordless ``sudo docker``
command or login as root. For example:
.. code::
$ sudo yum install docker
$ # or `apt-get install docker` for Ubuntu, etc.
$ sudo systemctl start docker
$ sudo docker ps
The last command should print an empty table, to verify the system is ready.
An alternative method to set up permissions is by adding the current user to
"docker" group and making the docker daemon socket file (by default
``/var/run/docker.sock``) accessible to the group:
.. code::
$ sudo groupadd docker
$ sudo usermod $USER -G docker
$ sudo chown :docker /var/run/docker.sock
Note that any one of above configurations makes it possible for the user to
exploit the whole host with Docker bind mounting or other privileged
operations. So only do it on development machines.
Quickstart
----------
From source tree, type ``make docker`` to see the help. Testing can be started
without configuring or building QEMU (``configure`` and ``make`` are done in
the container, with parameters defined by the make target):
.. code::
make docker-test-build@min-glib
This will create a container instance using the ``min-glib`` image (the image
is downloaded and initialized automatically), in which the ``test-build`` job
is executed.
Images
------
Along with many other images, the ``min-glib`` image is defined in a Dockerfile
in ``tests/docker/dockefiles/``, called ``min-glib.docker``. ``make docker``
command will list all the available images.
To add a new image, simply create a new ``.docker`` file under the
``tests/docker/dockerfiles/`` directory.
A ``.pre`` script can be added beside the ``.docker`` file, which will be
executed before building the image under the build context directory. This is
mainly used to do necessary host side setup. One such setup is ``binfmt_misc``,
for example, to make qemu-user powered cross build containers work.
Tests
-----
Different tests are added to cover various configurations to build and test
QEMU. Docker tests are the executables under ``tests/docker`` named
``test-*``. They are typically shell scripts and are built on top of a shell
library, ``tests/docker/common.rc``, which provides helpers to find the QEMU
source and build it.
The full list of tests is printed in the ``make docker`` help.
Tools
-----
There are executables that are created to run in a specific Docker environment.
This makes it easy to write scripts that have heavy or special dependencies,
but are still very easy to use.
Currently the only tool is ``travis``, which mimics the Travis-CI tests in a
container. It runs in the ``travis`` image:
.. code::
make docker-travis@travis
Debugging a Docker test failure
-------------------------------
When CI tasks, maintainers or yourself report a Docker test failure, follow the
below steps to debug it:
1. Locally reproduce the failure with the reported command line. E.g. run
``make docker-test-mingw@fedora J=8``.
2. Add "V=1" to the command line, try again, to see the verbose output.
3. Further add "DEBUG=1" to the command line. This will pause in a shell prompt
in the container right before testing starts. You could either manually
build QEMU and run tests from there, or press Ctrl-D to let the Docker
testing continue.
4. If you press Ctrl-D, the same building and testing procedure will begin, and
will hopefully run into the error again. After that, you will be dropped to
the prompt for debug.
Options
-------
Various options can be used to affect how Docker tests are done. The full
list is in the ``make docker`` help text. The frequently used ones are:
* ``V=1``: the same as in top level ``make``. It will be propagated to the
container and enable verbose output.
* ``J=$N``: the number of parallel tasks in make commands in the container,
similar to the ``-j $N`` option in top level ``make``. (The ``-j`` option in
top level ``make`` will not be propagated into the container.)
* ``DEBUG=1``: enables debug. See the previous "Debugging a Docker test
failure" section.
VM testing
==========
This test suite contains scripts that bootstrap various guest images that have
necessary packages to build QEMU. The basic usage is documented in ``Makefile``
help which is displayed with ``make vm-test``.
Quickstart
----------
Run ``make vm-test`` to list available make targets. Invoke a specific make
command to run build test in an image. For example, ``make vm-build-freebsd``
will build the source tree in the FreeBSD image. The command can be executed
from either the source tree or the build dir; if the former, ``./configure`` is
not needed. The command will then generate the test image in ``./tests/vm/``
under the working directory.
Note: images created by the scripts accept a well-known RSA key pair for SSH
access, so they SHOULD NOT be exposed to external interfaces if you are
concerned about attackers taking control of the guest and potentially
exploiting a QEMU security bug to compromise the host.
QEMU binary
-----------
By default, qemu-system-x86_64 is searched in $PATH to run the guest. If there
isn't one, or if it is older than 2.10, the test won't work. In this case,
provide the QEMU binary in env var: ``QEMU=/path/to/qemu-2.10+``.
Make jobs
---------
The ``-j$X`` option in the make command line is not propagated into the VM,
specify ``J=$X`` to control the make jobs in the guest.
Debugging
---------
Add ``DEBUG=1`` and/or ``V=1`` to the make command to allow interactive
debugging and verbose output. If this is not enough, see the next section.
Manual invocation
-----------------
Each guest script is an executable script with the same command line options.
For example to work with the netbsd guest, use ``$QEMU_SRC/tests/vm/netbsd``:
.. code::
$ cd $QEMU_SRC/tests/vm
# To bootstrap the image
$ ./netbsd --build-image --image /var/tmp/netbsd.img
<...>
# To run an arbitrary command in guest (the output will not be echoed unless
# --debug is added)
$ ./netbsd --debug --image /var/tmp/netbsd.img uname -a
# To build QEMU in guest
$ ./netbsd --debug --image /var/tmp/netbsd.img --build-qemu $QEMU_SRC
# To get to an interactive shell
$ ./netbsd --interactive --image /var/tmp/netbsd.img sh
Adding new guests
-----------------
Please look at existing guest scripts for how to add new guests.
Most importantly, create a subclass of BaseVM and implement ``build_image()``
method and define ``BUILD_SCRIPT``, then finally call ``basevm.main()`` from
the script's ``main()``.
* Usually in ``build_image()``, a template image is downloaded from a
predefined URL. ``BaseVM._download_with_cache()`` takes care of the cache and
the checksum, so consider using it.
* Once the image is downloaded, users, SSH server and QEMU build deps should
be set up:
- Root password set to ``BaseVM.ROOT_PASS``
- User ``BaseVM.GUEST_USER`` is created, and password set to
``BaseVM.GUEST_PASS``
- SSH service is enabled and started on boot,
``$QEMU_SRC/tests/keys/id_rsa.pub`` is added to ssh's ``authorized_keys``
file of both root and the normal user
- DHCP client service is enabled and started on boot, so that it can
automatically configure the virtio-net-pci NIC and communicate with QEMU
user net (10.0.2.2)
- Necessary packages are installed to untar the source tarball and build
QEMU
* Write a proper ``BUILD_SCRIPT`` template, which should be a shell script that
untars a raw virtio-blk block device, which is the tarball data blob of the
QEMU source tree, then configure/build it. Running "make check" is also
recommended.
Image fuzzer testing
====================
An image fuzzer was added to exercise format drivers. Currently only qcow2 is
supported. To start the fuzzer, run
.. code::
tests/image-fuzzer/runner.py -c '[["qemu-img", "info", "$test_img"]]' /tmp/test qcow2
Alternatively, some command different from "qemu-img info" can be tested, by
changing the ``-c`` option.

View File

@ -785,6 +785,43 @@ warning: ssh server @code{ssh.example.com:22} does not support fsync
With sufficiently new versions of libssh2 and OpenSSH, @code{fsync} is
supported.
@node disk_images_nvme
@subsection NVMe disk images
NVM Express (NVMe) storage controllers can be accessed directly by a userspace
driver in QEMU. This bypasses the host kernel file system and block layers
while retaining QEMU block layer functionalities, such as block jobs, I/O
throttling, image formats, etc. Disk I/O performance is typically higher than
with @code{-drive file=/dev/sda} using either thread pool or linux-aio.
The controller will be exclusively used by the QEMU process once started. To be
able to share storage between multiple VMs and other applications on the host,
please use the file based protocols.
Before starting QEMU, bind the host NVMe controller to the host vfio-pci
driver. For example:
@example
# modprobe vfio-pci
# lspci -n -s 0000:06:0d.0
06:0d.0 0401: 1102:0002 (rev 08)
# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
# qemu-system-x86_64 -drive file=nvme://@var{host}:@var{bus}:@var{slot}.@var{func}/@var{namespace}
@end example
Alternative syntax using properties:
@example
qemu-system-x86_64 -drive file.driver=nvme,file.device=@var{host}:@var{bus}:@var{slot}.@var{func},file.namespace=@var{namespace}
@end example
@var{host}:@var{bus}:@var{slot}.@var{func} is the NVMe controller's PCI device
address on the host.
@var{namespace} is the NVMe namespace number, starting from 1.
@node disk_image_locking
@subsection Disk image file locking

View File

@ -20,13 +20,13 @@
static void fsdev_throttle_read_timer_cb(void *opaque)
{
FsThrottle *fst = opaque;
qemu_co_enter_next(&fst->throttled_reqs[false]);
qemu_co_enter_next(&fst->throttled_reqs[false], NULL);
}
static void fsdev_throttle_write_timer_cb(void *opaque)
{
FsThrottle *fst = opaque;
qemu_co_enter_next(&fst->throttled_reqs[true]);
qemu_co_enter_next(&fst->throttled_reqs[true], NULL);
}
void fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp)

View File

@ -1,703 +1,7 @@
#ifndef HW_NVME_H
#define HW_NVME_H
#include "qemu/cutils.h"
typedef struct NvmeBar {
uint64_t cap;
uint32_t vs;
uint32_t intms;
uint32_t intmc;
uint32_t cc;
uint32_t rsvd1;
uint32_t csts;
uint32_t nssrc;
uint32_t aqa;
uint64_t asq;
uint64_t acq;
uint32_t cmbloc;
uint32_t cmbsz;
} NvmeBar;
enum NvmeCapShift {
CAP_MQES_SHIFT = 0,
CAP_CQR_SHIFT = 16,
CAP_AMS_SHIFT = 17,
CAP_TO_SHIFT = 24,
CAP_DSTRD_SHIFT = 32,
CAP_NSSRS_SHIFT = 33,
CAP_CSS_SHIFT = 37,
CAP_MPSMIN_SHIFT = 48,
CAP_MPSMAX_SHIFT = 52,
};
enum NvmeCapMask {
CAP_MQES_MASK = 0xffff,
CAP_CQR_MASK = 0x1,
CAP_AMS_MASK = 0x3,
CAP_TO_MASK = 0xff,
CAP_DSTRD_MASK = 0xf,
CAP_NSSRS_MASK = 0x1,
CAP_CSS_MASK = 0xff,
CAP_MPSMIN_MASK = 0xf,
CAP_MPSMAX_MASK = 0xf,
};
#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK)
#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK)
#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK)
#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK)
#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK)
#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK)
#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK)
#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
<< CAP_MQES_SHIFT)
#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \
<< CAP_CQR_SHIFT)
#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \
<< CAP_AMS_SHIFT)
#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \
<< CAP_TO_SHIFT)
#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \
<< CAP_DSTRD_SHIFT)
#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \
<< CAP_NSSRS_SHIFT)
#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \
<< CAP_CSS_SHIFT)
#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
<< CAP_MPSMIN_SHIFT)
#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
<< CAP_MPSMAX_SHIFT)
enum NvmeCcShift {
CC_EN_SHIFT = 0,
CC_CSS_SHIFT = 4,
CC_MPS_SHIFT = 7,
CC_AMS_SHIFT = 11,
CC_SHN_SHIFT = 14,
CC_IOSQES_SHIFT = 16,
CC_IOCQES_SHIFT = 20,
};
enum NvmeCcMask {
CC_EN_MASK = 0x1,
CC_CSS_MASK = 0x7,
CC_MPS_MASK = 0xf,
CC_AMS_MASK = 0x7,
CC_SHN_MASK = 0x3,
CC_IOSQES_MASK = 0xf,
CC_IOCQES_MASK = 0xf,
};
#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK)
#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK)
#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK)
#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK)
#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK)
#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
enum NvmeCstsShift {
CSTS_RDY_SHIFT = 0,
CSTS_CFS_SHIFT = 1,
CSTS_SHST_SHIFT = 2,
CSTS_NSSRO_SHIFT = 4,
};
enum NvmeCstsMask {
CSTS_RDY_MASK = 0x1,
CSTS_CFS_MASK = 0x1,
CSTS_SHST_MASK = 0x3,
CSTS_NSSRO_MASK = 0x1,
};
enum NvmeCsts {
NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT,
NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT,
NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT,
NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT,
};
#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK)
#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK)
#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK)
#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
enum NvmeAqaShift {
AQA_ASQS_SHIFT = 0,
AQA_ACQS_SHIFT = 16,
};
enum NvmeAqaMask {
AQA_ASQS_MASK = 0xfff,
AQA_ACQS_MASK = 0xfff,
};
#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
enum NvmeCmblocShift {
CMBLOC_BIR_SHIFT = 0,
CMBLOC_OFST_SHIFT = 12,
};
enum NvmeCmblocMask {
CMBLOC_BIR_MASK = 0x7,
CMBLOC_OFST_MASK = 0xfffff,
};
#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \
CMBLOC_BIR_MASK)
#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \
CMBLOC_OFST_MASK)
#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
(cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
(cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
enum NvmeCmbszShift {
CMBSZ_SQS_SHIFT = 0,
CMBSZ_CQS_SHIFT = 1,
CMBSZ_LISTS_SHIFT = 2,
CMBSZ_RDS_SHIFT = 3,
CMBSZ_WDS_SHIFT = 4,
CMBSZ_SZU_SHIFT = 8,
CMBSZ_SZ_SHIFT = 12,
};
enum NvmeCmbszMask {
CMBSZ_SQS_MASK = 0x1,
CMBSZ_CQS_MASK = 0x1,
CMBSZ_LISTS_MASK = 0x1,
CMBSZ_RDS_MASK = 0x1,
CMBSZ_WDS_MASK = 0x1,
CMBSZ_SZU_MASK = 0xf,
CMBSZ_SZ_MASK = 0xfffff,
};
#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK)
#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK)
#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK)
#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK)
#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK)
#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK)
#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK)
#define NVME_CMBSZ_SET_SQS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT)
#define NVME_CMBSZ_SET_CQS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT)
#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT)
#define NVME_CMBSZ_SET_RDS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT)
#define NVME_CMBSZ_SET_WDS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT)
#define NVME_CMBSZ_SET_SZU(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT)
#define NVME_CMBSZ_SET_SZ(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT)
#define NVME_CMBSZ_GETSIZE(cmbsz) \
(NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
typedef struct NvmeCmd {
uint8_t opcode;
uint8_t fuse;
uint16_t cid;
uint32_t nsid;
uint64_t res1;
uint64_t mptr;
uint64_t prp1;
uint64_t prp2;
uint32_t cdw10;
uint32_t cdw11;
uint32_t cdw12;
uint32_t cdw13;
uint32_t cdw14;
uint32_t cdw15;
} NvmeCmd;
enum NvmeAdminCommands {
NVME_ADM_CMD_DELETE_SQ = 0x00,
NVME_ADM_CMD_CREATE_SQ = 0x01,
NVME_ADM_CMD_GET_LOG_PAGE = 0x02,
NVME_ADM_CMD_DELETE_CQ = 0x04,
NVME_ADM_CMD_CREATE_CQ = 0x05,
NVME_ADM_CMD_IDENTIFY = 0x06,
NVME_ADM_CMD_ABORT = 0x08,
NVME_ADM_CMD_SET_FEATURES = 0x09,
NVME_ADM_CMD_GET_FEATURES = 0x0a,
NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c,
NVME_ADM_CMD_ACTIVATE_FW = 0x10,
NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
};
enum NvmeIoCommands {
NVME_CMD_FLUSH = 0x00,
NVME_CMD_WRITE = 0x01,
NVME_CMD_READ = 0x02,
NVME_CMD_WRITE_UNCOR = 0x04,
NVME_CMD_COMPARE = 0x05,
NVME_CMD_WRITE_ZEROS = 0x08,
NVME_CMD_DSM = 0x09,
};
typedef struct NvmeDeleteQ {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t rsvd1[9];
uint16_t qid;
uint16_t rsvd10;
uint32_t rsvd11[5];
} NvmeDeleteQ;
typedef struct NvmeCreateCq {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t rsvd1[5];
uint64_t prp1;
uint64_t rsvd8;
uint16_t cqid;
uint16_t qsize;
uint16_t cq_flags;
uint16_t irq_vector;
uint32_t rsvd12[4];
} NvmeCreateCq;
#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
typedef struct NvmeCreateSq {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t rsvd1[5];
uint64_t prp1;
uint64_t rsvd8;
uint16_t sqid;
uint16_t qsize;
uint16_t sq_flags;
uint16_t cqid;
uint32_t rsvd12[4];
} NvmeCreateSq;
#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3)
enum NvmeQueueFlags {
NVME_Q_PC = 1,
NVME_Q_PRIO_URGENT = 0,
NVME_Q_PRIO_HIGH = 1,
NVME_Q_PRIO_NORMAL = 2,
NVME_Q_PRIO_LOW = 3,
};
typedef struct NvmeIdentify {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t nsid;
uint64_t rsvd2[2];
uint64_t prp1;
uint64_t prp2;
uint32_t cns;
uint32_t rsvd11[5];
} NvmeIdentify;
typedef struct NvmeRwCmd {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t nsid;
uint64_t rsvd2;
uint64_t mptr;
uint64_t prp1;
uint64_t prp2;
uint64_t slba;
uint16_t nlb;
uint16_t control;
uint32_t dsmgmt;
uint32_t reftag;
uint16_t apptag;
uint16_t appmask;
} NvmeRwCmd;
enum {
NVME_RW_LR = 1 << 15,
NVME_RW_FUA = 1 << 14,
NVME_RW_DSM_FREQ_UNSPEC = 0,
NVME_RW_DSM_FREQ_TYPICAL = 1,
NVME_RW_DSM_FREQ_RARE = 2,
NVME_RW_DSM_FREQ_READS = 3,
NVME_RW_DSM_FREQ_WRITES = 4,
NVME_RW_DSM_FREQ_RW = 5,
NVME_RW_DSM_FREQ_ONCE = 6,
NVME_RW_DSM_FREQ_PREFETCH = 7,
NVME_RW_DSM_FREQ_TEMP = 8,
NVME_RW_DSM_LATENCY_NONE = 0 << 4,
NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
NVME_RW_DSM_LATENCY_NORM = 2 << 4,
NVME_RW_DSM_LATENCY_LOW = 3 << 4,
NVME_RW_DSM_SEQ_REQ = 1 << 6,
NVME_RW_DSM_COMPRESSED = 1 << 7,
NVME_RW_PRINFO_PRACT = 1 << 13,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
};
typedef struct NvmeDsmCmd {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t nsid;
uint64_t rsvd2[2];
uint64_t prp1;
uint64_t prp2;
uint32_t nr;
uint32_t attributes;
uint32_t rsvd12[4];
} NvmeDsmCmd;
enum {
NVME_DSMGMT_IDR = 1 << 0,
NVME_DSMGMT_IDW = 1 << 1,
NVME_DSMGMT_AD = 1 << 2,
};
typedef struct NvmeDsmRange {
uint32_t cattr;
uint32_t nlb;
uint64_t slba;
} NvmeDsmRange;
enum NvmeAsyncEventRequest {
NVME_AER_TYPE_ERROR = 0,
NVME_AER_TYPE_SMART = 1,
NVME_AER_TYPE_IO_SPECIFIC = 6,
NVME_AER_TYPE_VENDOR_SPECIFIC = 7,
NVME_AER_INFO_ERR_INVALID_SQ = 0,
NVME_AER_INFO_ERR_INVALID_DB = 1,
NVME_AER_INFO_ERR_DIAG_FAIL = 2,
NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3,
NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4,
NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5,
NVME_AER_INFO_SMART_RELIABILITY = 0,
NVME_AER_INFO_SMART_TEMP_THRESH = 1,
NVME_AER_INFO_SMART_SPARE_THRESH = 2,
};
typedef struct NvmeAerResult {
uint8_t event_type;
uint8_t event_info;
uint8_t log_page;
uint8_t resv;
} NvmeAerResult;
typedef struct NvmeCqe {
uint32_t result;
uint32_t rsvd;
uint16_t sq_head;
uint16_t sq_id;
uint16_t cid;
uint16_t status;
} NvmeCqe;
enum NvmeStatusCodes {
NVME_SUCCESS = 0x0000,
NVME_INVALID_OPCODE = 0x0001,
NVME_INVALID_FIELD = 0x0002,
NVME_CID_CONFLICT = 0x0003,
NVME_DATA_TRAS_ERROR = 0x0004,
NVME_POWER_LOSS_ABORT = 0x0005,
NVME_INTERNAL_DEV_ERROR = 0x0006,
NVME_CMD_ABORT_REQ = 0x0007,
NVME_CMD_ABORT_SQ_DEL = 0x0008,
NVME_CMD_ABORT_FAILED_FUSE = 0x0009,
NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
NVME_INVALID_NSID = 0x000b,
NVME_CMD_SEQ_ERROR = 0x000c,
NVME_LBA_RANGE = 0x0080,
NVME_CAP_EXCEEDED = 0x0081,
NVME_NS_NOT_READY = 0x0082,
NVME_NS_RESV_CONFLICT = 0x0083,
NVME_INVALID_CQID = 0x0100,
NVME_INVALID_QID = 0x0101,
NVME_MAX_QSIZE_EXCEEDED = 0x0102,
NVME_ACL_EXCEEDED = 0x0103,
NVME_RESERVED = 0x0104,
NVME_AER_LIMIT_EXCEEDED = 0x0105,
NVME_INVALID_FW_SLOT = 0x0106,
NVME_INVALID_FW_IMAGE = 0x0107,
NVME_INVALID_IRQ_VECTOR = 0x0108,
NVME_INVALID_LOG_ID = 0x0109,
NVME_INVALID_FORMAT = 0x010a,
NVME_FW_REQ_RESET = 0x010b,
NVME_INVALID_QUEUE_DEL = 0x010c,
NVME_FID_NOT_SAVEABLE = 0x010d,
NVME_FID_NOT_NSID_SPEC = 0x010f,
NVME_FW_REQ_SUSYSTEM_RESET = 0x0110,
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
NVME_WRITE_FAULT = 0x0280,
NVME_UNRECOVERED_READ = 0x0281,
NVME_E2E_GUARD_ERROR = 0x0282,
NVME_E2E_APP_ERROR = 0x0283,
NVME_E2E_REF_ERROR = 0x0284,
NVME_CMP_FAILURE = 0x0285,
NVME_ACCESS_DENIED = 0x0286,
NVME_MORE = 0x2000,
NVME_DNR = 0x4000,
NVME_NO_COMPLETE = 0xffff,
};
typedef struct NvmeFwSlotInfoLog {
uint8_t afi;
uint8_t reserved1[7];
uint8_t frs1[8];
uint8_t frs2[8];
uint8_t frs3[8];
uint8_t frs4[8];
uint8_t frs5[8];
uint8_t frs6[8];
uint8_t frs7[8];
uint8_t reserved2[448];
} NvmeFwSlotInfoLog;
typedef struct NvmeErrorLog {
uint64_t error_count;
uint16_t sqid;
uint16_t cid;
uint16_t status_field;
uint16_t param_error_location;
uint64_t lba;
uint32_t nsid;
uint8_t vs;
uint8_t resv[35];
} NvmeErrorLog;
typedef struct NvmeSmartLog {
uint8_t critical_warning;
uint8_t temperature[2];
uint8_t available_spare;
uint8_t available_spare_threshold;
uint8_t percentage_used;
uint8_t reserved1[26];
uint64_t data_units_read[2];
uint64_t data_units_written[2];
uint64_t host_read_commands[2];
uint64_t host_write_commands[2];
uint64_t controller_busy_time[2];
uint64_t power_cycles[2];
uint64_t power_on_hours[2];
uint64_t unsafe_shutdowns[2];
uint64_t media_errors[2];
uint64_t number_of_error_log_entries[2];
uint8_t reserved2[320];
} NvmeSmartLog;
enum NvmeSmartWarn {
NVME_SMART_SPARE = 1 << 0,
NVME_SMART_TEMPERATURE = 1 << 1,
NVME_SMART_RELIABILITY = 1 << 2,
NVME_SMART_MEDIA_READ_ONLY = 1 << 3,
NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4,
};
enum LogIdentifier {
NVME_LOG_ERROR_INFO = 0x01,
NVME_LOG_SMART_INFO = 0x02,
NVME_LOG_FW_SLOT_INFO = 0x03,
};
typedef struct NvmePSD {
uint16_t mp;
uint16_t reserved;
uint32_t enlat;
uint32_t exlat;
uint8_t rrt;
uint8_t rrl;
uint8_t rwt;
uint8_t rwl;
uint8_t resv[16];
} NvmePSD;
typedef struct NvmeIdCtrl {
uint16_t vid;
uint16_t ssvid;
uint8_t sn[20];
uint8_t mn[40];
uint8_t fr[8];
uint8_t rab;
uint8_t ieee[3];
uint8_t cmic;
uint8_t mdts;
uint8_t rsvd255[178];
uint16_t oacs;
uint8_t acl;
uint8_t aerl;
uint8_t frmw;
uint8_t lpa;
uint8_t elpe;
uint8_t npss;
uint8_t rsvd511[248];
uint8_t sqes;
uint8_t cqes;
uint16_t rsvd515;
uint32_t nn;
uint16_t oncs;
uint16_t fuses;
uint8_t fna;
uint8_t vwc;
uint16_t awun;
uint16_t awupf;
uint8_t rsvd703[174];
uint8_t rsvd2047[1344];
NvmePSD psd[32];
uint8_t vs[1024];
} NvmeIdCtrl;
enum NvmeIdCtrlOacs {
NVME_OACS_SECURITY = 1 << 0,
NVME_OACS_FORMAT = 1 << 1,
NVME_OACS_FW = 1 << 2,
};
enum NvmeIdCtrlOncs {
NVME_ONCS_COMPARE = 1 << 0,
NVME_ONCS_WRITE_UNCORR = 1 << 1,
NVME_ONCS_DSM = 1 << 2,
NVME_ONCS_WRITE_ZEROS = 1 << 3,
NVME_ONCS_FEATURES = 1 << 4,
NVME_ONCS_RESRVATIONS = 1 << 5,
};
#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf)
#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf)
#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf)
typedef struct NvmeFeatureVal {
uint32_t arbitration;
uint32_t power_mgmt;
uint32_t temp_thresh;
uint32_t err_rec;
uint32_t volatile_wc;
uint32_t num_queues;
uint32_t int_coalescing;
uint32_t *int_vector_config;
uint32_t write_atomicity;
uint32_t async_config;
uint32_t sw_prog_marker;
} NvmeFeatureVal;
#define NVME_ARB_AB(arb) (arb & 0x7)
#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff)
#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff)
#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff)
#define NVME_INTC_THR(intc) (intc & 0xff)
#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff)
enum NvmeFeatureIds {
NVME_ARBITRATION = 0x1,
NVME_POWER_MANAGEMENT = 0x2,
NVME_LBA_RANGE_TYPE = 0x3,
NVME_TEMPERATURE_THRESHOLD = 0x4,
NVME_ERROR_RECOVERY = 0x5,
NVME_VOLATILE_WRITE_CACHE = 0x6,
NVME_NUMBER_OF_QUEUES = 0x7,
NVME_INTERRUPT_COALESCING = 0x8,
NVME_INTERRUPT_VECTOR_CONF = 0x9,
NVME_WRITE_ATOMICITY = 0xa,
NVME_ASYNCHRONOUS_EVENT_CONF = 0xb,
NVME_SOFTWARE_PROGRESS_MARKER = 0x80
};
typedef struct NvmeRangeType {
uint8_t type;
uint8_t attributes;
uint8_t rsvd2[14];
uint64_t slba;
uint64_t nlb;
uint8_t guid[16];
uint8_t rsvd48[16];
} NvmeRangeType;
typedef struct NvmeLBAF {
uint16_t ms;
uint8_t ds;
uint8_t rp;
} NvmeLBAF;
typedef struct NvmeIdNs {
uint64_t nsze;
uint64_t ncap;
uint64_t nuse;
uint8_t nsfeat;
uint8_t nlbaf;
uint8_t flbas;
uint8_t mc;
uint8_t dpc;
uint8_t dps;
uint8_t res30[98];
NvmeLBAF lbaf[16];
uint8_t res192[192];
uint8_t vs[3712];
} NvmeIdNs;
#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1))
#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1)
#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1)
#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1)
#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1)
#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1))
#define NVME_ID_NS_DPC_TYPE_MASK 0x7
enum NvmeIdNsDps {
DPS_TYPE_NONE = 0,
DPS_TYPE_1 = 1,
DPS_TYPE_2 = 2,
DPS_TYPE_3 = 3,
DPS_TYPE_MASK = 0x7,
DPS_FIRST_EIGHT = 8,
};
static inline void _nvme_check_size(void)
{
QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
}
#include "block/nvme.h"
typedef struct NvmeAsyncEvent {
QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry;

View File

@ -631,5 +631,14 @@ void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
uint32_t granularity, Error **errp);
/**
*
* bdrv_register_buf/bdrv_unregister_buf:
*
* Register/unregister a buffer for I/O. For example, VFIO drivers are
* interested to know the memory areas that would later be used for I/O, so
* that they can prepare IOMMU mapping etc., to get better performance.
*/
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
void bdrv_unregister_buf(BlockDriverState *bs, void *host);
#endif

View File

@ -446,6 +446,15 @@ struct BlockDriver {
const char *name,
Error **errp);
/**
* Register/unregister a buffer for I/O. For example, when the driver is
* interested to know the memory areas that will later be used in iovs, so
* that it can do IOMMU mapping with VFIO etc., in order to get better
* performance. In the case of VFIO drivers, this callback is used to do
* DMA mapping for hot buffers.
*/
void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size);
void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host);
QLIST_ENTRY(BlockDriver) list;
};

700
include/block/nvme.h Normal file
View File

@ -0,0 +1,700 @@
#ifndef BLOCK_NVME_H
#define BLOCK_NVME_H
typedef struct NvmeBar {
uint64_t cap;
uint32_t vs;
uint32_t intms;
uint32_t intmc;
uint32_t cc;
uint32_t rsvd1;
uint32_t csts;
uint32_t nssrc;
uint32_t aqa;
uint64_t asq;
uint64_t acq;
uint32_t cmbloc;
uint32_t cmbsz;
} NvmeBar;
enum NvmeCapShift {
CAP_MQES_SHIFT = 0,
CAP_CQR_SHIFT = 16,
CAP_AMS_SHIFT = 17,
CAP_TO_SHIFT = 24,
CAP_DSTRD_SHIFT = 32,
CAP_NSSRS_SHIFT = 33,
CAP_CSS_SHIFT = 37,
CAP_MPSMIN_SHIFT = 48,
CAP_MPSMAX_SHIFT = 52,
};
enum NvmeCapMask {
CAP_MQES_MASK = 0xffff,
CAP_CQR_MASK = 0x1,
CAP_AMS_MASK = 0x3,
CAP_TO_MASK = 0xff,
CAP_DSTRD_MASK = 0xf,
CAP_NSSRS_MASK = 0x1,
CAP_CSS_MASK = 0xff,
CAP_MPSMIN_MASK = 0xf,
CAP_MPSMAX_MASK = 0xf,
};
#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK)
#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK)
#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK)
#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK)
#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK)
#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK)
#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK)
#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
<< CAP_MQES_SHIFT)
#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \
<< CAP_CQR_SHIFT)
#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \
<< CAP_AMS_SHIFT)
#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \
<< CAP_TO_SHIFT)
#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \
<< CAP_DSTRD_SHIFT)
#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \
<< CAP_NSSRS_SHIFT)
#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \
<< CAP_CSS_SHIFT)
#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
<< CAP_MPSMIN_SHIFT)
#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
<< CAP_MPSMAX_SHIFT)
enum NvmeCcShift {
CC_EN_SHIFT = 0,
CC_CSS_SHIFT = 4,
CC_MPS_SHIFT = 7,
CC_AMS_SHIFT = 11,
CC_SHN_SHIFT = 14,
CC_IOSQES_SHIFT = 16,
CC_IOCQES_SHIFT = 20,
};
enum NvmeCcMask {
CC_EN_MASK = 0x1,
CC_CSS_MASK = 0x7,
CC_MPS_MASK = 0xf,
CC_AMS_MASK = 0x7,
CC_SHN_MASK = 0x3,
CC_IOSQES_MASK = 0xf,
CC_IOCQES_MASK = 0xf,
};
#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK)
#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK)
#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK)
#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK)
#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK)
#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
enum NvmeCstsShift {
CSTS_RDY_SHIFT = 0,
CSTS_CFS_SHIFT = 1,
CSTS_SHST_SHIFT = 2,
CSTS_NSSRO_SHIFT = 4,
};
enum NvmeCstsMask {
CSTS_RDY_MASK = 0x1,
CSTS_CFS_MASK = 0x1,
CSTS_SHST_MASK = 0x3,
CSTS_NSSRO_MASK = 0x1,
};
enum NvmeCsts {
NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT,
NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT,
NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT,
NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT,
};
#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK)
#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK)
#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK)
#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
enum NvmeAqaShift {
AQA_ASQS_SHIFT = 0,
AQA_ACQS_SHIFT = 16,
};
enum NvmeAqaMask {
AQA_ASQS_MASK = 0xfff,
AQA_ACQS_MASK = 0xfff,
};
#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
enum NvmeCmblocShift {
CMBLOC_BIR_SHIFT = 0,
CMBLOC_OFST_SHIFT = 12,
};
enum NvmeCmblocMask {
CMBLOC_BIR_MASK = 0x7,
CMBLOC_OFST_MASK = 0xfffff,
};
#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \
CMBLOC_BIR_MASK)
#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \
CMBLOC_OFST_MASK)
#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
(cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
(cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
enum NvmeCmbszShift {
CMBSZ_SQS_SHIFT = 0,
CMBSZ_CQS_SHIFT = 1,
CMBSZ_LISTS_SHIFT = 2,
CMBSZ_RDS_SHIFT = 3,
CMBSZ_WDS_SHIFT = 4,
CMBSZ_SZU_SHIFT = 8,
CMBSZ_SZ_SHIFT = 12,
};
enum NvmeCmbszMask {
CMBSZ_SQS_MASK = 0x1,
CMBSZ_CQS_MASK = 0x1,
CMBSZ_LISTS_MASK = 0x1,
CMBSZ_RDS_MASK = 0x1,
CMBSZ_WDS_MASK = 0x1,
CMBSZ_SZU_MASK = 0xf,
CMBSZ_SZ_MASK = 0xfffff,
};
#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK)
#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK)
#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK)
#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK)
#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK)
#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK)
#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK)
#define NVME_CMBSZ_SET_SQS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT)
#define NVME_CMBSZ_SET_CQS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT)
#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT)
#define NVME_CMBSZ_SET_RDS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT)
#define NVME_CMBSZ_SET_WDS(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT)
#define NVME_CMBSZ_SET_SZU(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT)
#define NVME_CMBSZ_SET_SZ(cmbsz, val) \
(cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT)
#define NVME_CMBSZ_GETSIZE(cmbsz) \
(NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
typedef struct NvmeCmd {
uint8_t opcode;
uint8_t fuse;
uint16_t cid;
uint32_t nsid;
uint64_t res1;
uint64_t mptr;
uint64_t prp1;
uint64_t prp2;
uint32_t cdw10;
uint32_t cdw11;
uint32_t cdw12;
uint32_t cdw13;
uint32_t cdw14;
uint32_t cdw15;
} NvmeCmd;
enum NvmeAdminCommands {
NVME_ADM_CMD_DELETE_SQ = 0x00,
NVME_ADM_CMD_CREATE_SQ = 0x01,
NVME_ADM_CMD_GET_LOG_PAGE = 0x02,
NVME_ADM_CMD_DELETE_CQ = 0x04,
NVME_ADM_CMD_CREATE_CQ = 0x05,
NVME_ADM_CMD_IDENTIFY = 0x06,
NVME_ADM_CMD_ABORT = 0x08,
NVME_ADM_CMD_SET_FEATURES = 0x09,
NVME_ADM_CMD_GET_FEATURES = 0x0a,
NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c,
NVME_ADM_CMD_ACTIVATE_FW = 0x10,
NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
};
enum NvmeIoCommands {
NVME_CMD_FLUSH = 0x00,
NVME_CMD_WRITE = 0x01,
NVME_CMD_READ = 0x02,
NVME_CMD_WRITE_UNCOR = 0x04,
NVME_CMD_COMPARE = 0x05,
NVME_CMD_WRITE_ZEROS = 0x08,
NVME_CMD_DSM = 0x09,
};
typedef struct NvmeDeleteQ {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t rsvd1[9];
uint16_t qid;
uint16_t rsvd10;
uint32_t rsvd11[5];
} NvmeDeleteQ;
typedef struct NvmeCreateCq {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t rsvd1[5];
uint64_t prp1;
uint64_t rsvd8;
uint16_t cqid;
uint16_t qsize;
uint16_t cq_flags;
uint16_t irq_vector;
uint32_t rsvd12[4];
} NvmeCreateCq;
#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
typedef struct NvmeCreateSq {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t rsvd1[5];
uint64_t prp1;
uint64_t rsvd8;
uint16_t sqid;
uint16_t qsize;
uint16_t sq_flags;
uint16_t cqid;
uint32_t rsvd12[4];
} NvmeCreateSq;
#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3)
enum NvmeQueueFlags {
NVME_Q_PC = 1,
NVME_Q_PRIO_URGENT = 0,
NVME_Q_PRIO_HIGH = 1,
NVME_Q_PRIO_NORMAL = 2,
NVME_Q_PRIO_LOW = 3,
};
typedef struct NvmeIdentify {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t nsid;
uint64_t rsvd2[2];
uint64_t prp1;
uint64_t prp2;
uint32_t cns;
uint32_t rsvd11[5];
} NvmeIdentify;
typedef struct NvmeRwCmd {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t nsid;
uint64_t rsvd2;
uint64_t mptr;
uint64_t prp1;
uint64_t prp2;
uint64_t slba;
uint16_t nlb;
uint16_t control;
uint32_t dsmgmt;
uint32_t reftag;
uint16_t apptag;
uint16_t appmask;
} NvmeRwCmd;
enum {
NVME_RW_LR = 1 << 15,
NVME_RW_FUA = 1 << 14,
NVME_RW_DSM_FREQ_UNSPEC = 0,
NVME_RW_DSM_FREQ_TYPICAL = 1,
NVME_RW_DSM_FREQ_RARE = 2,
NVME_RW_DSM_FREQ_READS = 3,
NVME_RW_DSM_FREQ_WRITES = 4,
NVME_RW_DSM_FREQ_RW = 5,
NVME_RW_DSM_FREQ_ONCE = 6,
NVME_RW_DSM_FREQ_PREFETCH = 7,
NVME_RW_DSM_FREQ_TEMP = 8,
NVME_RW_DSM_LATENCY_NONE = 0 << 4,
NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
NVME_RW_DSM_LATENCY_NORM = 2 << 4,
NVME_RW_DSM_LATENCY_LOW = 3 << 4,
NVME_RW_DSM_SEQ_REQ = 1 << 6,
NVME_RW_DSM_COMPRESSED = 1 << 7,
NVME_RW_PRINFO_PRACT = 1 << 13,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
};
typedef struct NvmeDsmCmd {
uint8_t opcode;
uint8_t flags;
uint16_t cid;
uint32_t nsid;
uint64_t rsvd2[2];
uint64_t prp1;
uint64_t prp2;
uint32_t nr;
uint32_t attributes;
uint32_t rsvd12[4];
} NvmeDsmCmd;
enum {
NVME_DSMGMT_IDR = 1 << 0,
NVME_DSMGMT_IDW = 1 << 1,
NVME_DSMGMT_AD = 1 << 2,
};
typedef struct NvmeDsmRange {
uint32_t cattr;
uint32_t nlb;
uint64_t slba;
} NvmeDsmRange;
enum NvmeAsyncEventRequest {
NVME_AER_TYPE_ERROR = 0,
NVME_AER_TYPE_SMART = 1,
NVME_AER_TYPE_IO_SPECIFIC = 6,
NVME_AER_TYPE_VENDOR_SPECIFIC = 7,
NVME_AER_INFO_ERR_INVALID_SQ = 0,
NVME_AER_INFO_ERR_INVALID_DB = 1,
NVME_AER_INFO_ERR_DIAG_FAIL = 2,
NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3,
NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4,
NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5,
NVME_AER_INFO_SMART_RELIABILITY = 0,
NVME_AER_INFO_SMART_TEMP_THRESH = 1,
NVME_AER_INFO_SMART_SPARE_THRESH = 2,
};
typedef struct NvmeAerResult {
uint8_t event_type;
uint8_t event_info;
uint8_t log_page;
uint8_t resv;
} NvmeAerResult;
typedef struct NvmeCqe {
uint32_t result;
uint32_t rsvd;
uint16_t sq_head;
uint16_t sq_id;
uint16_t cid;
uint16_t status;
} NvmeCqe;
enum NvmeStatusCodes {
NVME_SUCCESS = 0x0000,
NVME_INVALID_OPCODE = 0x0001,
NVME_INVALID_FIELD = 0x0002,
NVME_CID_CONFLICT = 0x0003,
NVME_DATA_TRAS_ERROR = 0x0004,
NVME_POWER_LOSS_ABORT = 0x0005,
NVME_INTERNAL_DEV_ERROR = 0x0006,
NVME_CMD_ABORT_REQ = 0x0007,
NVME_CMD_ABORT_SQ_DEL = 0x0008,
NVME_CMD_ABORT_FAILED_FUSE = 0x0009,
NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
NVME_INVALID_NSID = 0x000b,
NVME_CMD_SEQ_ERROR = 0x000c,
NVME_LBA_RANGE = 0x0080,
NVME_CAP_EXCEEDED = 0x0081,
NVME_NS_NOT_READY = 0x0082,
NVME_NS_RESV_CONFLICT = 0x0083,
NVME_INVALID_CQID = 0x0100,
NVME_INVALID_QID = 0x0101,
NVME_MAX_QSIZE_EXCEEDED = 0x0102,
NVME_ACL_EXCEEDED = 0x0103,
NVME_RESERVED = 0x0104,
NVME_AER_LIMIT_EXCEEDED = 0x0105,
NVME_INVALID_FW_SLOT = 0x0106,
NVME_INVALID_FW_IMAGE = 0x0107,
NVME_INVALID_IRQ_VECTOR = 0x0108,
NVME_INVALID_LOG_ID = 0x0109,
NVME_INVALID_FORMAT = 0x010a,
NVME_FW_REQ_RESET = 0x010b,
NVME_INVALID_QUEUE_DEL = 0x010c,
NVME_FID_NOT_SAVEABLE = 0x010d,
NVME_FID_NOT_NSID_SPEC = 0x010f,
NVME_FW_REQ_SUSYSTEM_RESET = 0x0110,
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
NVME_WRITE_FAULT = 0x0280,
NVME_UNRECOVERED_READ = 0x0281,
NVME_E2E_GUARD_ERROR = 0x0282,
NVME_E2E_APP_ERROR = 0x0283,
NVME_E2E_REF_ERROR = 0x0284,
NVME_CMP_FAILURE = 0x0285,
NVME_ACCESS_DENIED = 0x0286,
NVME_MORE = 0x2000,
NVME_DNR = 0x4000,
NVME_NO_COMPLETE = 0xffff,
};
typedef struct NvmeFwSlotInfoLog {
uint8_t afi;
uint8_t reserved1[7];
uint8_t frs1[8];
uint8_t frs2[8];
uint8_t frs3[8];
uint8_t frs4[8];
uint8_t frs5[8];
uint8_t frs6[8];
uint8_t frs7[8];
uint8_t reserved2[448];
} NvmeFwSlotInfoLog;
typedef struct NvmeErrorLog {
uint64_t error_count;
uint16_t sqid;
uint16_t cid;
uint16_t status_field;
uint16_t param_error_location;
uint64_t lba;
uint32_t nsid;
uint8_t vs;
uint8_t resv[35];
} NvmeErrorLog;
typedef struct NvmeSmartLog {
uint8_t critical_warning;
uint8_t temperature[2];
uint8_t available_spare;
uint8_t available_spare_threshold;
uint8_t percentage_used;
uint8_t reserved1[26];
uint64_t data_units_read[2];
uint64_t data_units_written[2];
uint64_t host_read_commands[2];
uint64_t host_write_commands[2];
uint64_t controller_busy_time[2];
uint64_t power_cycles[2];
uint64_t power_on_hours[2];
uint64_t unsafe_shutdowns[2];
uint64_t media_errors[2];
uint64_t number_of_error_log_entries[2];
uint8_t reserved2[320];
} NvmeSmartLog;
enum NvmeSmartWarn {
NVME_SMART_SPARE = 1 << 0,
NVME_SMART_TEMPERATURE = 1 << 1,
NVME_SMART_RELIABILITY = 1 << 2,
NVME_SMART_MEDIA_READ_ONLY = 1 << 3,
NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4,
};
enum LogIdentifier {
NVME_LOG_ERROR_INFO = 0x01,
NVME_LOG_SMART_INFO = 0x02,
NVME_LOG_FW_SLOT_INFO = 0x03,
};
typedef struct NvmePSD {
uint16_t mp;
uint16_t reserved;
uint32_t enlat;
uint32_t exlat;
uint8_t rrt;
uint8_t rrl;
uint8_t rwt;
uint8_t rwl;
uint8_t resv[16];
} NvmePSD;
typedef struct NvmeIdCtrl {
uint16_t vid;
uint16_t ssvid;
uint8_t sn[20];
uint8_t mn[40];
uint8_t fr[8];
uint8_t rab;
uint8_t ieee[3];
uint8_t cmic;
uint8_t mdts;
uint8_t rsvd255[178];
uint16_t oacs;
uint8_t acl;
uint8_t aerl;
uint8_t frmw;
uint8_t lpa;
uint8_t elpe;
uint8_t npss;
uint8_t rsvd511[248];
uint8_t sqes;
uint8_t cqes;
uint16_t rsvd515;
uint32_t nn;
uint16_t oncs;
uint16_t fuses;
uint8_t fna;
uint8_t vwc;
uint16_t awun;
uint16_t awupf;
uint8_t rsvd703[174];
uint8_t rsvd2047[1344];
NvmePSD psd[32];
uint8_t vs[1024];
} NvmeIdCtrl;
enum NvmeIdCtrlOacs {
NVME_OACS_SECURITY = 1 << 0,
NVME_OACS_FORMAT = 1 << 1,
NVME_OACS_FW = 1 << 2,
};
enum NvmeIdCtrlOncs {
NVME_ONCS_COMPARE = 1 << 0,
NVME_ONCS_WRITE_UNCORR = 1 << 1,
NVME_ONCS_DSM = 1 << 2,
NVME_ONCS_WRITE_ZEROS = 1 << 3,
NVME_ONCS_FEATURES = 1 << 4,
NVME_ONCS_RESRVATIONS = 1 << 5,
};
#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf)
#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf)
#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf)
typedef struct NvmeFeatureVal {
uint32_t arbitration;
uint32_t power_mgmt;
uint32_t temp_thresh;
uint32_t err_rec;
uint32_t volatile_wc;
uint32_t num_queues;
uint32_t int_coalescing;
uint32_t *int_vector_config;
uint32_t write_atomicity;
uint32_t async_config;
uint32_t sw_prog_marker;
} NvmeFeatureVal;
#define NVME_ARB_AB(arb) (arb & 0x7)
#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff)
#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff)
#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff)
#define NVME_INTC_THR(intc) (intc & 0xff)
#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff)
enum NvmeFeatureIds {
NVME_ARBITRATION = 0x1,
NVME_POWER_MANAGEMENT = 0x2,
NVME_LBA_RANGE_TYPE = 0x3,
NVME_TEMPERATURE_THRESHOLD = 0x4,
NVME_ERROR_RECOVERY = 0x5,
NVME_VOLATILE_WRITE_CACHE = 0x6,
NVME_NUMBER_OF_QUEUES = 0x7,
NVME_INTERRUPT_COALESCING = 0x8,
NVME_INTERRUPT_VECTOR_CONF = 0x9,
NVME_WRITE_ATOMICITY = 0xa,
NVME_ASYNCHRONOUS_EVENT_CONF = 0xb,
NVME_SOFTWARE_PROGRESS_MARKER = 0x80
};
typedef struct NvmeRangeType {
uint8_t type;
uint8_t attributes;
uint8_t rsvd2[14];
uint64_t slba;
uint64_t nlb;
uint8_t guid[16];
uint8_t rsvd48[16];
} NvmeRangeType;
typedef struct NvmeLBAF {
uint16_t ms;
uint8_t ds;
uint8_t rp;
} NvmeLBAF;
typedef struct NvmeIdNs {
uint64_t nsze;
uint64_t ncap;
uint64_t nuse;
uint8_t nsfeat;
uint8_t nlbaf;
uint8_t flbas;
uint8_t mc;
uint8_t dpc;
uint8_t dps;
uint8_t res30[98];
NvmeLBAF lbaf[16];
uint8_t res192[192];
uint8_t vs[3712];
} NvmeIdNs;
#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1))
#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1)
#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1)
#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1)
#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1)
#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1))
#define NVME_ID_NS_DPC_TYPE_MASK 0x7
enum NvmeIdNsDps {
DPS_TYPE_NONE = 0,
DPS_TYPE_1 = 1,
DPS_TYPE_2 = 2,
DPS_TYPE_3 = 3,
DPS_TYPE_MASK = 0x7,
DPS_FIRST_EIGHT = 8,
};
static inline void _nvme_check_size(void)
{
QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
}
#endif

View File

@ -114,5 +114,44 @@
#ifndef __has_feature
#define __has_feature(x) 0 /* compatibility with non-clang compilers */
#endif
/* Implement C11 _Generic via GCC builtins. Example:
*
* QEMU_GENERIC(x, (float, sinf), (long double, sinl), sin) (x)
*
* The first argument is the discriminator. The last is the default value.
* The middle ones are tuples in "(type, expansion)" format.
*/
/* First, find out the number of generic cases. */
#define QEMU_GENERIC(x, ...) \
QEMU_GENERIC_(typeof(x), __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
/* There will be extra arguments, but they are not used. */
#define QEMU_GENERIC_(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, count, ...) \
QEMU_GENERIC##count(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)
/* Two more helper macros, this time to extract items from a parenthesized
* list.
*/
#define QEMU_FIRST_(a, b) a
#define QEMU_SECOND_(a, b) b
/* ... and a final one for the common part of the "recursion". */
#define QEMU_GENERIC_IF(x, type_then, else_) \
__builtin_choose_expr(__builtin_types_compatible_p(x, \
QEMU_FIRST_ type_then), \
QEMU_SECOND_ type_then, else_)
/* CPP poor man's "recursion". */
#define QEMU_GENERIC1(x, a0, ...) (a0)
#define QEMU_GENERIC2(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC1(x, __VA_ARGS__))
#define QEMU_GENERIC3(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC2(x, __VA_ARGS__))
#define QEMU_GENERIC4(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC3(x, __VA_ARGS__))
#define QEMU_GENERIC5(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC4(x, __VA_ARGS__))
#define QEMU_GENERIC6(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC5(x, __VA_ARGS__))
#define QEMU_GENERIC7(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC6(x, __VA_ARGS__))
#define QEMU_GENERIC8(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC7(x, __VA_ARGS__))
#define QEMU_GENERIC9(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC8(x, __VA_ARGS__))
#define QEMU_GENERIC10(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC9(x, __VA_ARGS__))
#endif /* COMPILER_H */

View File

@ -121,7 +121,7 @@ bool qemu_coroutine_entered(Coroutine *co);
* Provides a mutex that can be used to synchronise coroutines
*/
struct CoWaitRecord;
typedef struct CoMutex {
struct CoMutex {
/* Count of pending lockers; 0 for a free mutex, 1 for an
* uncontended mutex.
*/
@ -142,7 +142,7 @@ typedef struct CoMutex {
unsigned handoff, sequence;
Coroutine *holder;
} CoMutex;
};
/**
* Initialises a CoMutex. This must be called before any other operation is used
@ -183,24 +183,33 @@ void qemu_co_queue_init(CoQueue *queue);
* caller of the coroutine. The mutex is unlocked during the wait and
* locked again afterwards.
*/
void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex);
#define qemu_co_queue_wait(queue, lock) \
qemu_co_queue_wait_impl(queue, QEMU_MAKE_LOCKABLE(lock))
void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock);
/**
* Restarts the next coroutine in the CoQueue and removes it from the queue.
*
* Returns true if a coroutine was restarted, false if the queue is empty.
* Removes the next coroutine from the CoQueue, and wake it up.
* Returns true if a coroutine was removed, false if the queue is empty.
*/
bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
/**
* Restarts all coroutines in the CoQueue and leaves the queue empty.
* Empties the CoQueue; all coroutines are woken up.
*/
void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
/**
* Enter the next coroutine in the queue
* Removes the next coroutine from the CoQueue, and wake it up. Unlike
* qemu_co_queue_next, this function releases the lock during aio_co_wake
* because it is meant to be used outside coroutine context; in that case, the
* coroutine is entered immediately, before qemu_co_enter_next returns.
*
* If used in coroutine context, qemu_co_enter_next is equivalent to
* qemu_co_queue_next.
*/
bool qemu_co_enter_next(CoQueue *queue);
#define qemu_co_enter_next(queue, lock) \
qemu_co_enter_next_impl(queue, QEMU_MAKE_LOCKABLE(lock))
bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock);
/**
* Checks if the CoQueue is empty.
@ -271,4 +280,6 @@ void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns);
*/
void coroutine_fn yield_until_fd_readable(int fd);
#include "qemu/lockable.h"
#endif /* QEMU_COROUTINE_H */

96
include/qemu/lockable.h Normal file
View File

@ -0,0 +1,96 @@
/*
* Polymorphic locking functions (aka poor man templates)
*
* Copyright Red Hat, Inc. 2017, 2018
*
* Author: Paolo Bonzini <pbonzini@redhat.com>
*
* This work is licensed under the terms of the GNU LGPL, version 2 or later.
* See the COPYING.LIB file in the top-level directory.
*
*/
#ifndef QEMU_LOCKABLE_H
#define QEMU_LOCKABLE_H
#include "qemu/coroutine.h"
#include "qemu/thread.h"
typedef void QemuLockUnlockFunc(void *);
struct QemuLockable {
void *object;
QemuLockUnlockFunc *lock;
QemuLockUnlockFunc *unlock;
};
/* This function gives an error if an invalid, non-NULL pointer type is passed
* to QEMU_MAKE_LOCKABLE. For optimized builds, we can rely on dead-code elimination
* from the compiler, and give the errors already at link time.
*/
#ifdef __OPTIMIZE__
void unknown_lock_type(void *);
#else
static inline void unknown_lock_type(void *unused)
{
abort();
}
#endif
static inline __attribute__((__always_inline__)) QemuLockable *
qemu_make_lockable(void *x, QemuLockable *lockable)
{
/* We cannot test this in a macro, otherwise we get compiler
* warnings like "the address of 'm' will always evaluate as 'true'".
*/
return x ? lockable : NULL;
}
/* Auxiliary macros to simplify QEMU_MAKE_LOCABLE. */
#define QEMU_LOCK_FUNC(x) ((QemuLockUnlockFunc *) \
QEMU_GENERIC(x, \
(QemuMutex *, qemu_mutex_lock), \
(CoMutex *, qemu_co_mutex_lock), \
(QemuSpin *, qemu_spin_lock), \
unknown_lock_type))
#define QEMU_UNLOCK_FUNC(x) ((QemuLockUnlockFunc *) \
QEMU_GENERIC(x, \
(QemuMutex *, qemu_mutex_unlock), \
(CoMutex *, qemu_co_mutex_unlock), \
(QemuSpin *, qemu_spin_unlock), \
unknown_lock_type))
/* In C, compound literals have the lifetime of an automatic variable.
* In C++ it would be different, but then C++ wouldn't need QemuLockable
* either...
*/
#define QEMU_MAKE_LOCKABLE_(x) qemu_make_lockable((x), &(QemuLockable) { \
.object = (x), \
.lock = QEMU_LOCK_FUNC(x), \
.unlock = QEMU_UNLOCK_FUNC(x), \
})
/* QEMU_MAKE_LOCKABLE - Make a polymorphic QemuLockable
*
* @x: a lock object (currently one of QemuMutex, CoMutex, QemuSpin).
*
* Returns a QemuLockable object that can be passed around
* to a function that can operate with locks of any kind.
*/
#define QEMU_MAKE_LOCKABLE(x) \
QEMU_GENERIC(x, \
(QemuLockable *, (x)), \
QEMU_MAKE_LOCKABLE_(x))
static inline void qemu_lockable_lock(QemuLockable *x)
{
x->lock(x->object);
}
static inline void qemu_lockable_unlock(QemuLockable *x)
{
x->unlock(x->object);
}
#endif

View File

@ -4,7 +4,6 @@
#include "qemu/processor.h"
#include "qemu/atomic.h"
typedef struct QemuMutex QemuMutex;
typedef struct QemuCond QemuCond;
typedef struct QemuSemaphore QemuSemaphore;
typedef struct QemuEvent QemuEvent;
@ -97,9 +96,9 @@ struct Notifier;
void qemu_thread_atexit_add(struct Notifier *notifier);
void qemu_thread_atexit_remove(struct Notifier *notifier);
typedef struct QemuSpin {
struct QemuSpin {
int value;
} QemuSpin;
};
static inline void qemu_spin_init(QemuSpin *spin)
{

View File

@ -19,6 +19,7 @@ typedef struct BusClass BusClass;
typedef struct BusState BusState;
typedef struct Chardev Chardev;
typedef struct CompatProperty CompatProperty;
typedef struct CoMutex CoMutex;
typedef struct CPUAddressSpace CPUAddressSpace;
typedef struct CPUState CPUState;
typedef struct DeviceListener DeviceListener;
@ -86,9 +87,12 @@ typedef struct QEMUBH QEMUBH;
typedef struct QemuConsole QemuConsole;
typedef struct QemuDmaBuf QemuDmaBuf;
typedef struct QEMUFile QEMUFile;
typedef struct QemuLockable QemuLockable;
typedef struct QemuMutex QemuMutex;
typedef struct QemuOpt QemuOpt;
typedef struct QemuOpts QemuOpts;
typedef struct QemuOptsList QemuOptsList;
typedef struct QemuSpin QemuSpin;
typedef struct QEMUSGList QEMUSGList;
typedef struct QEMUTimer QEMUTimer;
typedef struct QEMUTimerListGroup QEMUTimerListGroup;

View File

@ -0,0 +1,33 @@
/*
* QEMU VFIO helpers
*
* Copyright 2016 - 2018 Red Hat, Inc.
*
* Authors:
* Fam Zheng <famz@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#ifndef QEMU_VFIO_HELPERS_H
#define QEMU_VFIO_HELPERS_H
#include "qemu/typedefs.h"
typedef struct QEMUVFIOState QEMUVFIOState;
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp);
void qemu_vfio_close(QEMUVFIOState *s);
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
bool temporary, uint64_t *iova_list);
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s);
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host);
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
uint64_t offset, uint64_t size,
Error **errp);
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
uint64_t offset, uint64_t size);
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
int irq_type, Error **errp);
#endif

View File

@ -229,4 +229,7 @@ void blk_io_limits_enable(BlockBackend *blk, const char *group);
void blk_io_limits_update_group(BlockBackend *blk, const char *group);
void blk_set_force_allow_inactivate(BlockBackend *blk);
void blk_register_buf(BlockBackend *blk, void *host, size_t size);
void blk_unregister_buf(BlockBackend *blk, void *host);
#endif

View File

@ -2248,6 +2248,7 @@
#
# @vxhs: Since 2.10
# @throttle: Since 2.11
# @nvme: Since 2.12
#
# Since: 2.9
##
@ -2255,7 +2256,7 @@
'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop',
'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
'null-aio', 'null-co', 'parallels', 'qcow', 'qcow2', 'qed',
'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
@ -2296,6 +2297,19 @@
{ 'struct': 'BlockdevOptionsNull',
'data': { '*size': 'int', '*latency-ns': 'uint64' } }
##
# @BlockdevOptionsNVMe:
#
# Driver specific block device options for the NVMe backend.
#
# @device: controller address of the NVMe device.
# @namespace: namespace number of the device, starting from 1.
#
# Since: 2.12
##
{ 'struct': 'BlockdevOptionsNVMe',
'data': { 'device': 'str', 'namespace': 'int' } }
##
# @BlockdevOptionsVVFAT:
#
@ -3201,6 +3215,7 @@
'nfs': 'BlockdevOptionsNfs',
'null-aio': 'BlockdevOptionsNull',
'null-co': 'BlockdevOptionsNull',
'nvme': 'BlockdevOptionsNVMe',
'parallels': 'BlockdevOptionsGenericFormat',
'qcow2': 'BlockdevOptionsQcow2',
'qcow': 'BlockdevOptionsQcow',

View File

@ -621,6 +621,7 @@ encrypted disk images.
* disk_images_iscsi:: iSCSI LUNs
* disk_images_gluster:: GlusterFS disk images
* disk_images_ssh:: Secure Shell (ssh) disk images
* disk_images_nvme:: NVMe userspace driver
* disk_image_locking:: Disk image file locking
@end menu

View File

@ -3862,6 +3862,7 @@ static int img_bench(int argc, char **argv)
struct timeval t1, t2;
int i;
bool force_share = false;
size_t buf_size;
for (;;) {
static const struct option long_options[] = {
@ -4050,9 +4051,12 @@ static int img_bench(int argc, char **argv)
printf("Sending flush every %d requests\n", flush_interval);
}
data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);
buf_size = data.nrreq * data.bufsize;
data.buf = blk_blockalign(blk, buf_size);
memset(data.buf, pattern, data.nrreq * data.bufsize);
blk_register_buf(blk, data.buf, buf_size);
data.qiov = g_new(QEMUIOVector, data.nrreq);
for (i = 0; i < data.nrreq; i++) {
qemu_iovec_init(&data.qiov[i], 1);
@ -4073,6 +4077,9 @@ static int img_bench(int argc, char **argv)
+ ((double)(t2.tv_usec - t1.tv_usec) / 1000000));
out:
if (data.buf) {
blk_unregister_buf(blk, data.buf);
}
qemu_vfree(data.buf);
blk_unref(blk);

View File

@ -42,3 +42,4 @@ stub-obj-y += vmgenid.o
stub-obj-y += xen-common.o
stub-obj-y += xen-hvm.o
stub-obj-y += pci-host-piix.o
stub-obj-y += ram-block.o

16
stubs/ram-block.c Normal file
View File

@ -0,0 +1,16 @@
#include "qemu/osdep.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
void ram_block_notifier_add(RAMBlockNotifier *n)
{
}
void ram_block_notifier_remove(RAMBlockNotifier *n)
{
}
int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
return 0;
}

View File

@ -1,4 +1,4 @@
FROM fedora:latest
FROM fedora:27
ENV PACKAGES \
ccache gettext git tar PyYAML sparse flex bison python3 bzip2 hostname \
glib2-devel pixman-devel zlib-devel SDL-devel libfdt-devel \

View File

@ -14,6 +14,7 @@
#include "qemu/osdep.h"
#include "qemu/coroutine.h"
#include "qemu/coroutine_int.h"
#include "qemu/lockable.h"
/*
* Check that qemu_in_coroutine() works
@ -175,7 +176,7 @@ static void coroutine_fn c1_fn(void *opaque)
qemu_coroutine_enter(c2);
}
static void test_co_queue(void)
static void test_no_dangling_access(void)
{
Coroutine *c1;
Coroutine *c2;
@ -195,6 +196,74 @@ static void test_co_queue(void)
*c1 = tmp;
}
static bool locked;
static int done;
static void coroutine_fn mutex_fn(void *opaque)
{
CoMutex *m = opaque;
qemu_co_mutex_lock(m);
assert(!locked);
locked = true;
qemu_coroutine_yield();
locked = false;
qemu_co_mutex_unlock(m);
done++;
}
static void coroutine_fn lockable_fn(void *opaque)
{
QemuLockable *x = opaque;
qemu_lockable_lock(x);
assert(!locked);
locked = true;
qemu_coroutine_yield();
locked = false;
qemu_lockable_unlock(x);
done++;
}
static void do_test_co_mutex(CoroutineEntry *entry, void *opaque)
{
Coroutine *c1 = qemu_coroutine_create(entry, opaque);
Coroutine *c2 = qemu_coroutine_create(entry, opaque);
done = 0;
qemu_coroutine_enter(c1);
g_assert(locked);
qemu_coroutine_enter(c2);
/* Unlock queues c2. It is then started automatically when c1 yields or
* terminates.
*/
qemu_coroutine_enter(c1);
g_assert_cmpint(done, ==, 1);
g_assert(locked);
qemu_coroutine_enter(c2);
g_assert_cmpint(done, ==, 2);
g_assert(!locked);
}
static void test_co_mutex(void)
{
CoMutex m;
qemu_co_mutex_init(&m);
do_test_co_mutex(mutex_fn, &m);
}
static void test_co_mutex_lockable(void)
{
CoMutex m;
CoMutex *null_pointer = NULL;
qemu_co_mutex_init(&m);
do_test_co_mutex(lockable_fn, QEMU_MAKE_LOCKABLE(&m));
g_assert(QEMU_MAKE_LOCKABLE(null_pointer) == NULL);
}
/*
* Check that creation, enter, and return work
*/
@ -422,7 +491,7 @@ int main(int argc, char **argv)
* crash, so skip it.
*/
if (CONFIG_COROUTINE_POOL) {
g_test_add_func("/basic/co_queue", test_co_queue);
g_test_add_func("/basic/no-dangling-access", test_no_dangling_access);
}
g_test_add_func("/basic/lifecycle", test_lifecycle);
@ -432,6 +501,8 @@ int main(int argc, char **argv)
g_test_add_func("/basic/entered", test_entered);
g_test_add_func("/basic/in_coroutine", test_in_coroutine);
g_test_add_func("/basic/order", test_order);
g_test_add_func("/locking/co-mutex", test_co_mutex);
g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
if (g_test_perf()) {
g_test_add_func("/perf/lifecycle", perf_lifecycle);
g_test_add_func("/perf/nesting", perf_nesting);

View File

@ -1,89 +1 @@
=== VM test suite to run build in guests ===
== Intro ==
This test suite contains scripts that bootstrap various guest images that have
necessary packages to build QEMU. The basic usage is documented in Makefile
help which is displayed with "make vm-test".
== Quick start ==
Run "make vm-test" to list available make targets. Invoke a specific make
command to run build test in an image. For example, "make vm-build-freebsd"
will build the source tree in the FreeBSD image. The command can be executed
from either the source tree or the build dir; if the former, ./configure is not
needed. The command will then generate the test image in ./tests/vm/ under the
working directory.
Note: images created by the scripts accept a well-known RSA key pair for SSH
access, so they SHOULD NOT be exposed to external interfaces if you are
concerned about attackers taking control of the guest and potentially
exploiting a QEMU security bug to compromise the host.
== QEMU binary ==
By default, qemu-system-x86_64 is searched in $PATH to run the guest. If there
isn't one, or if it is older than 2.10, the test won't work. In this case,
provide the QEMU binary in env var: QEMU=/path/to/qemu-2.10+.
== Make jobs ==
The "-j$X" option in the make command line is not propagated into the VM,
specify "J=$X" to control the make jobs in the guest.
== Debugging ==
Add "DEBUG=1" and/or "V=1" to the make command to allow interactive debugging
and verbose output. If this is not enough, see the next section.
== Manual invocation ==
Each guest script is an executable script with the same command line options.
For example to work with the netbsd guest, use $QEMU_SRC/tests/vm/netbsd:
$ cd $QEMU_SRC/tests/vm
# To bootstrap the image
$ ./netbsd --build-image --image /var/tmp/netbsd.img
<...>
# To run an arbitrary command in guest (the output will not be echoed unless
# --debug is added)
$ ./netbsd --debug --image /var/tmp/netbsd.img uname -a
# To build QEMU in guest
$ ./netbsd --debug --image /var/tmp/netbsd.img --build-qemu $QEMU_SRC
# To get to an interactive shell
$ ./netbsd --interactive --image /var/tmp/netbsd.img sh
== Adding new guests ==
Please look at existing guest scripts for how to add new guests.
Most importantly, create a subclass of BaseVM and implement build_image()
method and define BUILD_SCRIPT, then finally call basevm.main() from the
script's main().
- Usually in build_image(), a template image is downloaded from a predefined
URL. BaseVM._download_with_cache() takes care of the cache and the
checksum, so consider using it.
- Once the image is downloaded, users, SSH server and QEMU build deps should
be set up:
* Root password set to BaseVM.ROOT_PASS
* User BaseVM.GUEST_USER is created, and password set to BaseVM.GUEST_PASS
* SSH service is enabled and started on boot,
$QEMU_SRC/tests/keys/id_rsa.pub is added to ssh's "authorized_keys" file
of both root and the normal user
* DHCP client service is enabled and started on boot, so that it can
automatically configure the virtio-net-pci NIC and communicate with QEMU
user net (10.0.2.2)
* Necessary packages are installed to untar the source tarball and build
QEMU
- Write a proper BUILD_SCRIPT template, which should be a shell script that
untars a raw virtio-blk block device, which is the tarball data blob of the
QEMU source tree, then configure/build it. Running "make check" is also
recommended.
See docs/devel/testing.rst for help.

View File

@ -46,3 +46,4 @@ util-obj-y += qht.o
util-obj-y += range.o
util-obj-y += stats64.o
util-obj-y += systemd.o
util-obj-$(CONFIG_LINUX) += vfio-helpers.o

View File

@ -40,13 +40,13 @@ void qemu_co_queue_init(CoQueue *queue)
QSIMPLEQ_INIT(&queue->entries);
}
void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock)
{
Coroutine *self = qemu_coroutine_self();
QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
if (mutex) {
qemu_co_mutex_unlock(mutex);
if (lock) {
qemu_lockable_unlock(lock);
}
/* There is no race condition here. Other threads will call
@ -60,9 +60,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
/* TODO: OSv implements wait morphing here, where the wakeup
* primitive automatically places the woken coroutine on the
* mutex's queue. This avoids the thundering herd effect.
* This could be implemented for CoMutexes, but not really for
* other cases of QemuLockable.
*/
if (mutex) {
qemu_co_mutex_lock(mutex);
if (lock) {
qemu_lockable_lock(lock);
}
}
@ -130,7 +132,7 @@ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue)
qemu_co_queue_do_restart(queue, false);
}
bool qemu_co_enter_next(CoQueue *queue)
bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock)
{
Coroutine *next;
@ -140,7 +142,13 @@ bool qemu_co_enter_next(CoQueue *queue)
}
QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
qemu_coroutine_enter(next);
if (lock) {
qemu_lockable_unlock(lock);
}
aio_co_wake(next);
if (lock) {
qemu_lockable_lock(lock);
}
return true;
}

View File

@ -60,3 +60,14 @@ lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter"
qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)"
qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)"
qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)"
# util/vfio-helpers.c
qemu_vfio_dma_reset_temporary(void *s) "s %p"
qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64
qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size %zu iova 0x%"PRIx64
qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size %zu temporary %d iova %p"
qemu_vfio_dma_map_invalid(void *s, void *mapping_host, size_t mapping_size, void *host, size_t size) "s %p mapping %p %zu requested %p %zu"
qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"

727
util/vfio-helpers.c Normal file
View File

@ -0,0 +1,727 @@
/*
* VFIO utility
*
* Copyright 2016 - 2018 Red Hat, Inc.
*
* Authors:
* Fam Zheng <famz@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "trace.h"
#include "qemu/queue.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "trace.h"
#define QEMU_VFIO_DEBUG 0
#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
* we can use a runtime limit; alternatively it's also possible to do platform
* specific detection by reading sysfs entries. Until then, 39 is a safe bet.
**/
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
typedef struct {
/* Page aligned addr. */
void *host;
size_t size;
uint64_t iova;
} IOVAMapping;
struct QEMUVFIOState {
QemuMutex lock;
/* These fields are protected by BQL */
int container;
int group;
int device;
RAMBlockNotifier ram_notifier;
struct vfio_region_info config_region_info, bar_region_info[6];
/* These fields are protected by @lock */
/* VFIO's IO virtual address space is managed by splitting into a few
* sections:
*
* --------------- <= 0
* |xxxxxxxxxxxxx|
* |-------------| <= QEMU_VFIO_IOVA_MIN
* | |
* | Fixed |
* | |
* |-------------| <= low_water_mark
* | |
* | Free |
* | |
* |-------------| <= high_water_mark
* | |
* | Temp |
* | |
* |-------------| <= QEMU_VFIO_IOVA_MAX
* |xxxxxxxxxxxxx|
* |xxxxxxxxxxxxx|
* ---------------
*
* - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
*
* - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
* [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
* reclaimed - low_water_mark never shrinks;
*
* - IOVAs in range [low_water_mark, high_water_mark) are free;
*
* - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
* mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
* is recycled. The caller should make sure I/O's depending on these
* mappings are completed before calling.
**/
uint64_t low_water_mark;
uint64_t high_water_mark;
IOVAMapping *mappings;
int nr_mappings;
};
/**
* Find group file by PCI device address as specified @device, and return the
* path. The returned string is owned by caller and should be g_free'ed later.
*/
static char *sysfs_find_group_file(const char *device, Error **errp)
{
char *sysfs_link;
char *sysfs_group;
char *p;
char *path = NULL;
sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
sysfs_group = g_malloc(PATH_MAX);
if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
goto out;
}
p = strrchr(sysfs_group, '/');
if (!p) {
error_setg(errp, "Failed to find iommu group number");
goto out;
}
path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
g_free(sysfs_link);
g_free(sysfs_group);
return path;
}
static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}
static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
assert_bar_index_valid(s, index);
s->bar_region_info[index] = (struct vfio_region_info) {
.index = VFIO_PCI_BAR0_REGION_INDEX + index,
.argsz = sizeof(struct vfio_region_info),
};
if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
error_setg_errno(errp, errno, "Failed to get BAR region info");
return -errno;
}
return 0;
}
/**
* Map a PCI bar area.
*/
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
uint64_t offset, uint64_t size,
Error **errp)
{
void *p;
assert_bar_index_valid(s, index);
p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
PROT_READ | PROT_WRITE, MAP_SHARED,
s->device, s->bar_region_info[index].offset + offset);
if (p == MAP_FAILED) {
error_setg_errno(errp, errno, "Failed to map BAR region");
p = NULL;
}
return p;
}
/**
* Unmap a PCI bar area.
*/
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
uint64_t offset, uint64_t size)
{
if (bar) {
munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
}
}
/**
* Initialize device IRQ with @irq_type and and register an event notifier.
*/
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
int irq_type, Error **errp)
{
int r;
struct vfio_irq_set *irq_set;
size_t irq_set_size;
struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
irq_info.index = irq_type;
if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
error_setg_errno(errp, errno, "Failed to get device interrupt info");
return -errno;
}
if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
error_setg(errp, "Device interrupt doesn't support eventfd");
return -EINVAL;
}
irq_set_size = sizeof(*irq_set) + sizeof(int);
irq_set = g_malloc0(irq_set_size);
/* Get to a known IRQ state */
*irq_set = (struct vfio_irq_set) {
.argsz = irq_set_size,
.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
.index = irq_info.index,
.start = 0,
.count = 1,
};
*(int *)&irq_set->data = event_notifier_get_fd(e);
r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
g_free(irq_set);
if (r) {
error_setg_errno(errp, errno, "Failed to setup device interrupt");
return -errno;
}
return 0;
}
static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
int size, int ofs)
{
int ret;
do {
ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
} while (ret == -1 && errno == EINTR);
return ret == size ? 0 : -errno;
}
static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
int ret;
do {
ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
} while (ret == -1 && errno == EINTR);
return ret == size ? 0 : -errno;
}
static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
Error **errp)
{
int ret;
int i;
uint16_t pci_cmd;
struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
char *group_file = NULL;
/* Create a new container */
s->container = open("/dev/vfio/vfio", O_RDWR);
if (s->container == -1) {
error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
return -errno;
}
if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
error_setg(errp, "Invalid VFIO version");
ret = -EINVAL;
goto fail_container;
}
if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
error_setg_errno(errp, errno, "VFIO IOMMU check failed");
ret = -EINVAL;
goto fail_container;
}
/* Open the group */
group_file = sysfs_find_group_file(device, errp);
if (!group_file) {
ret = -EINVAL;
goto fail_container;
}
s->group = open(group_file, O_RDWR);
if (s->group == -1) {
error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
group_file);
g_free(group_file);
ret = -errno;
goto fail_container;
}
g_free(group_file);
/* Test the group is viable and available */
if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
error_setg_errno(errp, errno, "Failed to get VFIO group status");
ret = -errno;
goto fail;
}
if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
error_setg(errp, "VFIO group is not viable");
ret = -EINVAL;
goto fail;
}
/* Add the group to the container */
if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
error_setg_errno(errp, errno, "Failed to add group to VFIO container");
ret = -errno;
goto fail;
}
/* Enable the IOMMU model we want */
if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
ret = -errno;
goto fail;
}
/* Get additional IOMMU info */
if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
error_setg_errno(errp, errno, "Failed to get IOMMU info");
ret = -errno;
goto fail;
}
s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
if (s->device < 0) {
error_setg_errno(errp, errno, "Failed to get device fd");
ret = -errno;
goto fail;
}
/* Test and setup the device */
if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
error_setg_errno(errp, errno, "Failed to get device info");
ret = -errno;
goto fail;
}
if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
error_setg(errp, "Invalid device regions");
ret = -EINVAL;
goto fail;
}
s->config_region_info = (struct vfio_region_info) {
.index = VFIO_PCI_CONFIG_REGION_INDEX,
.argsz = sizeof(struct vfio_region_info),
};
if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
error_setg_errno(errp, errno, "Failed to get config region info");
ret = -errno;
goto fail;
}
for (i = 0; i < 6; i++) {
ret = qemu_vfio_pci_init_bar(s, i, errp);
if (ret) {
goto fail;
}
}
/* Enable bus master */
ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
if (ret) {
goto fail;
}
pci_cmd |= PCI_COMMAND_MASTER;
ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
if (ret) {
goto fail;
}
return 0;
fail:
close(s->group);
fail_container:
close(s->container);
return ret;
}
static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
void *host, size_t size)
{
QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
trace_qemu_vfio_ram_block_added(s, host, size);
qemu_vfio_dma_map(s, host, size, false, NULL);
}
static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
void *host, size_t size)
{
QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
if (host) {
trace_qemu_vfio_ram_block_removed(s, host, size);
qemu_vfio_dma_unmap(s, host);
}
}
static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr,
ram_addr_t offset, ram_addr_t length,
void *opaque)
{
int ret;
QEMUVFIOState *s = opaque;
if (!host_addr) {
return 0;
}
ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
if (ret) {
fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
host_addr, (uint64_t)length);
}
return 0;
}
static void qemu_vfio_open_common(QEMUVFIOState *s)
{
s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
ram_block_notifier_add(&s->ram_notifier);
s->low_water_mark = QEMU_VFIO_IOVA_MIN;
s->high_water_mark = QEMU_VFIO_IOVA_MAX;
qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
qemu_mutex_init(&s->lock);
}
/**
* Open a PCI device, e.g. "0000:00:01.0".
*/
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
int r;
QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
r = qemu_vfio_init_pci(s, device, errp);
if (r) {
g_free(s);
return NULL;
}
qemu_vfio_open_common(s);
return s;
}
static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
if (QEMU_VFIO_DEBUG) {
printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
(uint64_t)m->size, (uint64_t)m->iova);
}
}
static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
int i;
if (QEMU_VFIO_DEBUG) {
printf("vfio mappings\n");
for (i = 0; i < s->nr_mappings; ++i) {
qemu_vfio_dump_mapping(&s->mappings[i]);
}
}
}
/**
* Find the mapping entry that contains [host, host + size) and set @index to
* the position. If no entry contains it, @index is the position _after_ which
* to insert the new mapping. IOW, it is the index of the largest element that
* is smaller than @host, or -1 if no entry is.
*/
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
int *index)
{
IOVAMapping *p = s->mappings;
IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
IOVAMapping *mid;
trace_qemu_vfio_find_mapping(s, host);
if (!p) {
*index = -1;
return NULL;
}
while (true) {
mid = p + (q - p) / 2;
if (mid == p) {
break;
}
if (mid->host > host) {
q = mid;
} else if (mid->host < host) {
p = mid;
} else {
break;
}
}
if (mid->host > host) {
mid--;
} else if (mid < &s->mappings[s->nr_mappings - 1]
&& (mid + 1)->host <= host) {
mid++;
}
*index = mid - &s->mappings[0];
if (mid >= &s->mappings[0] &&
mid->host <= host && mid->host + mid->size > host) {
assert(mid < &s->mappings[s->nr_mappings]);
return mid;
}
/* At this point *index + 1 is the right position to insert the new
* mapping.*/
return NULL;
}
/**
* Allocate IOVA and and create a new mapping record and insert it in @s.
*/
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
void *host, size_t size,
int index, uint64_t iova)
{
int shift;
IOVAMapping m = {.host = host, .size = size, .iova = iova};
IOVAMapping *insert;
assert(QEMU_IS_ALIGNED(size, getpagesize()));
assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize()));
assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize()));
trace_qemu_vfio_new_mapping(s, host, size, index, iova);
assert(index >= 0);
s->nr_mappings++;
s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]),
s->nr_mappings);
insert = &s->mappings[index];
shift = s->nr_mappings - index - 1;
if (shift) {
memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
}
*insert = m;
return insert;
}
/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
uint64_t iova)
{
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(dma_map),
.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
.iova = iova,
.vaddr = (uintptr_t)host,
.size = size,
};
trace_qemu_vfio_do_mapping(s, host, size, iova);
if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
error_report("VFIO_MAP_DMA: %d", -errno);
return -errno;
}
return 0;
}
/**
* Undo the DMA mapping from @s with VFIO, and remove from mapping list.
*/
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
Error **errp)
{
int index;
struct vfio_iommu_type1_dma_unmap unmap = {
.argsz = sizeof(unmap),
.flags = 0,
.iova = mapping->iova,
.size = mapping->size,
};
index = mapping - s->mappings;
assert(mapping->size > 0);
assert(QEMU_IS_ALIGNED(mapping->size, getpagesize()));
assert(index >= 0 && index < s->nr_mappings);
if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
}
memmove(mapping, &s->mappings[index + 1],
sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
s->nr_mappings--;
s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]),
s->nr_mappings);
}
/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
int i;
if (QEMU_VFIO_DEBUG) {
for (i = 0; i < s->nr_mappings - 1; ++i) {
if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
fprintf(stderr, "item %d not sorted!\n", i);
qemu_vfio_dump_mappings(s);
return false;
}
if (!(s->mappings[i].host + s->mappings[i].size <=
s->mappings[i + 1].host)) {
fprintf(stderr, "item %d overlap with next!\n", i);
qemu_vfio_dump_mappings(s);
return false;
}
}
}
return true;
}
/* Map [host, host + size) area into a contiguous IOVA address space, and store
* the result in @iova if not NULL. The caller need to make sure the area is
* aligned to page size, and mustn't overlap with existing mapping areas (split
* mapping status within this area is not allowed).
*/
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
bool temporary, uint64_t *iova)
{
int ret = 0;
int index;
IOVAMapping *mapping;
uint64_t iova0;
assert(QEMU_PTR_IS_ALIGNED(host, getpagesize()));
assert(QEMU_IS_ALIGNED(size, getpagesize()));
trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
qemu_mutex_lock(&s->lock);
mapping = qemu_vfio_find_mapping(s, host, &index);
if (mapping) {
iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
} else {
if (s->high_water_mark - s->low_water_mark + 1 < size) {
ret = -ENOMEM;
goto out;
}
if (!temporary) {
iova0 = s->low_water_mark;
mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
if (!mapping) {
ret = -ENOMEM;
goto out;
}
assert(qemu_vfio_verify_mappings(s));
ret = qemu_vfio_do_mapping(s, host, size, iova0);
if (ret) {
qemu_vfio_undo_mapping(s, mapping, NULL);
goto out;
}
s->low_water_mark += size;
qemu_vfio_dump_mappings(s);
} else {
iova0 = s->high_water_mark - size;
ret = qemu_vfio_do_mapping(s, host, size, iova0);
if (ret) {
goto out;
}
s->high_water_mark -= size;
}
}
if (iova) {
*iova = iova0;
}
out:
qemu_mutex_unlock(&s->lock);
return ret;
}
/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
struct vfio_iommu_type1_dma_unmap unmap = {
.argsz = sizeof(unmap),
.flags = 0,
.iova = s->high_water_mark,
.size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
};
trace_qemu_vfio_dma_reset_temporary(s);
qemu_mutex_lock(&s->lock);
if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
error_report("VFIO_UNMAP_DMA: %d", -errno);
qemu_mutex_unlock(&s->lock);
return -errno;
}
s->high_water_mark = QEMU_VFIO_IOVA_MAX;
qemu_mutex_unlock(&s->lock);
return 0;
}
/* Unmapping the whole area that was previously mapped with
* qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
int index = 0;
IOVAMapping *m;
if (!host) {
return;
}
trace_qemu_vfio_dma_unmap(s, host);
qemu_mutex_lock(&s->lock);
m = qemu_vfio_find_mapping(s, host, &index);
if (!m) {
goto out;
}
qemu_vfio_undo_mapping(s, m, NULL);
out:
qemu_mutex_unlock(&s->lock);
}
static void qemu_vfio_reset(QEMUVFIOState *s)
{
ioctl(s->device, VFIO_DEVICE_RESET);
}
/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
int i;
if (!s) {
return;
}
for (i = 0; i < s->nr_mappings; ++i) {
qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
}
ram_block_notifier_remove(&s->ram_notifier);
qemu_vfio_reset(s);
close(s->device);
close(s->group);
close(s->container);
}