qemu-e2k/hw/block/nvme.c
Markus Armbruster 02c4f26b15 block: Use g_new() & friends to avoid multiplying sizes
g_new(T, n) is safer than g_malloc(sizeof(*v) * n) for two reasons.
One, it catches multiplication overflowing size_t.  Two, it returns
T * rather than void *, which lets the compiler catch more type
errors.

Perhaps a conversion to g_malloc_n() would be neater in places, but
that's merely four years old, and we can't use such newfangled stuff.

This commit only touches allocations with size arguments of the form
sizeof(T), plus two that use 4 instead of sizeof(uint32_t).  We can
make the others safe by converting to g_malloc_n() when it becomes
available to us in a couple of years.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-08-20 11:51:28 +02:00
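
As a rough sketch of the pattern this commit converts (not taken from the commit itself; the dummy struct below merely stands in for the NvmeRequest array allocated in nvme_init_sq()):

    #include <glib.h>

    typedef struct {
        int dummy;              /* stand-in for the real NvmeRequest from nvme.h */
    } NvmeRequest;

    static void *alloc_requests_old(size_t count)
    {
        /* Before: the multiplication can silently overflow size_t, and the
         * void * result lets a mistyped assignment go unnoticed. */
        return g_malloc(sizeof(NvmeRequest) * count);
    }

    static NvmeRequest *alloc_requests_new(size_t count)
    {
        /* After: g_new() catches an overflowing count and returns
         * NvmeRequest *, so assigning to the wrong pointer type is
         * diagnosed at compile time. */
        return g_new(NvmeRequest, count);
    }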


/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.1, 1.0e
 *
 * http://www.nvmexpress.org/resources/
 */

/**
 * Usage: add options:
 *     -drive file=<file>,if=none,id=<drive_id>
 *     -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
 */

#include <hw/block/block.h>
#include <hw/hw.h>
#include <hw/pci/msix.h>
#include <hw/pci/pci.h>

#include "nvme.h"
static void nvme_process_sq(void *opaque);

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            pci_irq_pulse(&n->parent_obj);
        }
    }
}
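/*
 * Build a scatter-gather list from an NVMe PRP pair.  PRP1 covers the first
 * (possibly unaligned) chunk; PRP2 is either a second data page or, for
 * transfers longer than one page, the address of a PRP list that is walked
 * (and chained) page by page.  On error the partially built list is freed.
 */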
static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
    uint32_t len, NvmeCtrl *n)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;

    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
    qemu_sglist_add(qsg, prp1, trans_len);
    len -= trans_len;
    if (len) {
        if (!prp2) {
            goto unmap;
        }
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            nents = (len + n->page_size - 1) >> n->page_bits;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == n->max_prp_ents - 1 && len > n->page_size) {
                    if (!prp_ent || prp_ent & (n->page_size - 1)) {
                        goto unmap;
                    }
                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
                    pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
                        prp_trans);
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (!prp_ent || prp_ent & (n->page_size - 1)) {
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                qemu_sglist_add(qsg, prp_ent, trans_len);
                len -= trans_len;
                i++;
            }
        } else {
            if (prp2 & (n->page_size - 1)) {
                goto unmap;
            }
            qemu_sglist_add(qsg, prp2, len);
        }
    }
    return NVME_SUCCESS;

unmap:
    qemu_sglist_destroy(qsg);
    return NVME_INVALID_FIELD | NVME_DNR;
}
static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
    uint64_t prp1, uint64_t prp2)
{
    QEMUSGList qsg;

    if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (dma_buf_read(ptr, len, &qsg)) {
        qemu_sglist_destroy(&qsg);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}
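/*
 * Timer callback: post completed requests as completion queue entries,
 * stopping early if the CQ ring is full, return the request structures to
 * their submission queue's free list, and raise the interrupt.
 */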
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        QTAILQ_REMOVE(&cq->req_list, req, entry);
        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        nvme_inc_cq_tail(cq);
        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
            sizeof(req->cqe));
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    nvme_isr_notify(n, cq);
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeSQueue *sq = req->sq;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    bdrv_acct_done(n->conf.bs, &req->acct);
    if (!ret) {
        req->status = NVME_SUCCESS;
    } else {
        req->status = NVME_INTERNAL_DEV_ERROR;
    }

    qemu_sglist_destroy(&req->qsg);
    nvme_enqueue_req_completion(cq, req);
}
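/*
 * Handle an NVMe read or write: bounds-check the LBA range, map the PRPs
 * into req->qsg and submit the DMA through the block layer; the result is
 * reported asynchronously via nvme_rw_cb().
 */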
static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
    uint32_t nlb = le32_to_cpu(rw->nlb) + 1;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint64_t prp1 = le64_to_cpu(rw->prp1);
    uint64_t prp2 = le64_to_cpu(rw->prp2);

    uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t data_size = nlb << data_shift;
    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;

    if ((slba + nlb) > ns->id_ns.nsze) {
        return NVME_LBA_RANGE | NVME_DNR;
    }
    if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    assert((nlb << data_shift) == req->qsg.size);

    dma_acct_start(n->conf.bs, &req->acct, &req->qsg, is_write ?
        BDRV_ACCT_WRITE : BDRV_ACCT_READ);
    req->aiocb = is_write ?
        dma_bdrv_write(n->conf.bs, &req->qsg, aio_slba, nvme_rw_cb, req) :
        dma_bdrv_read(n->conf.bs, &req->qsg, aio_slba, nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
}
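/* Dispatch a command fetched from an I/O submission queue to its handler. */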
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(cmd->nsid);

    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    switch (cmd->opcode) {
    case NVME_CMD_FLUSH:
        return NVME_SUCCESS;
    case NVME_CMD_WRITE:
    case NVME_CMD_READ:
        return nvme_rw(n, ns, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_del(sq->timer);
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeRequest *req, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_sqid(n, qid)) {
        return NVME_INVALID_QID | NVME_DNR;
    }

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        req = QTAILQ_FIRST(&sq->out_req_list);
        assert(req->aiocb);
        bdrv_aio_cancel(req->aiocb);
    }
    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
            if (req->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, req, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}
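/*
 * Set up a submission queue, allocate its pool of request structures, and
 * link it to the completion queue it posts to.
 */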
static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || nvme_check_cqid(n, cqid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!sqid || (sqid && !nvme_check_sqid(n, sqid))) {
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1 || prp1 & (n->page_size - 1)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (!(NVME_SQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_del(cq->timer);
    timer_free(cq->timer);
    msix_vector_unuse(&n->parent_obj, cq->vector);
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_cqid(n, qid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (!QTAILQ_EMPTY(&cq->sq_list)) {
        return NVME_INVALID_QUEUE_DEL;
    }
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
{
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    msix_vector_use(&n->parent_obj, cq->vector);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || (cqid && !nvme_check_cqid(n, cqid))) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (vector > n->num_queues) {
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (!(NVME_CQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
        NVME_CQ_FLAGS_IEN(qflags));
    return NVME_SUCCESS;
}
static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)cmd;
    uint32_t cns = le32_to_cpu(c->cns);
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint64_t prp2 = le64_to_cpu(c->prp2);

    if (cns) {
        return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
            prp1, prp2);
    }
    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
        prp1, prp2);
}

static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);

    switch (dw10) {
    case NVME_NUMBER_OF_QUEUES:
        req->cqe.result = cpu_to_le32(n->num_queues);
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);

    switch (dw10) {
    case NVME_NUMBER_OF_QUEUES:
        req->cqe.result = cpu_to_le32(n->num_queues);
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    switch (cmd->opcode) {
    case NVME_ADM_CMD_DELETE_SQ:
        return nvme_del_sq(n, cmd);
    case NVME_ADM_CMD_CREATE_SQ:
        return nvme_create_sq(n, cmd);
    case NVME_ADM_CMD_DELETE_CQ:
        return nvme_del_cq(n, cmd);
    case NVME_ADM_CMD_CREATE_CQ:
        return nvme_create_cq(n, cmd);
    case NVME_ADM_CMD_IDENTIFY:
        return nvme_identify(n, cmd);
    case NVME_ADM_CMD_SET_FEATURES:
        return nvme_set_feature(n, cmd, req);
    case NVME_ADM_CMD_GET_FEATURES:
        return nvme_get_feature(n, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}
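/*
 * Timer callback: fetch commands from a submission queue and dispatch them
 * to the admin or I/O handler; complete immediately unless the handler
 * returns NVME_NO_COMPLETE.
 */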
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    uint16_t status;
    hwaddr addr;
    NvmeCmd cmd;
    NvmeRequest *req;

    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
        nvme_inc_sq_head(sq);

        req = QTAILQ_FIRST(&sq->req_list);
        QTAILQ_REMOVE(&sq->req_list, req, entry);
        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
        memset(&req->cqe, 0, sizeof(req->cqe));
        req->cqe.cid = cmd.cid;

        status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
            nvme_admin_cmd(n, &cmd, req);
        if (status != NVME_NO_COMPLETE) {
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
    }
}

static void nvme_clear_ctrl(NvmeCtrl *n)
{
    int i;

    for (i = 0; i < n->num_queues; i++) {
        if (n->sq[i] != NULL) {
            nvme_free_sq(n->sq[i], n);
        }
    }
    for (i = 0; i < n->num_queues; i++) {
        if (n->cq[i] != NULL) {
            nvme_free_cq(n->cq[i], n);
        }
    }

    bdrv_flush(n->conf.bs);
    n->bar.cc = 0;
}
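/*
 * Validate the configuration the guest wrote to CC/AQA/ASQ/ACQ and bring up
 * the admin queue pair; returns -1 if any value is out of range.
 */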
static int nvme_start_ctrl(NvmeCtrl *n)
{
    uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
    uint32_t page_size = 1 << page_bits;

    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
            n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
            NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
            NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
            NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
            !NVME_AQA_ASQS(n->bar.aqa) || NVME_AQA_ASQS(n->bar.aqa) > 4095 ||
            !NVME_AQA_ACQS(n->bar.aqa) || NVME_AQA_ACQS(n->bar.aqa) > 4095) {
        return -1;
    }

    n->page_bits = page_bits;
    n->page_size = page_size;
    n->max_prp_ents = n->page_size / sizeof(uint64_t);
    n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
    n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
    nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
        NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
    nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
        NVME_AQA_ASQS(n->bar.aqa) + 1);

    return 0;
}
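/*
 * Handle writes to the controller registers: interrupt mask set/clear,
 * CC enable/disable and shutdown transitions, and the admin queue
 * attributes and base addresses.
 */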
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
    unsigned size)
{
    switch (offset) {
    case 0xc:
        n->bar.intms |= data & 0xffffffff;
        n->bar.intmc = n->bar.intms;
        break;
    case 0x10:
        n->bar.intms &= ~(data & 0xffffffff);
        n->bar.intmc = n->bar.intms;
        break;
    case 0x14:
        if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
            n->bar.cc = data;
            if (nvme_start_ctrl(n)) {
                n->bar.csts = NVME_CSTS_FAILED;
            } else {
                n->bar.csts = NVME_CSTS_READY;
            }
        } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
            nvme_clear_ctrl(n);
            n->bar.csts &= ~NVME_CSTS_READY;
        }
        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
            nvme_clear_ctrl(n);
            n->bar.cc = data;
            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
            n->bar.cc = data;
        }
        break;
    case 0x24:
        n->bar.aqa = data & 0xffffffff;
        break;
    case 0x28:
        n->bar.asq = data;
        break;
    case 0x2c:
        n->bar.asq |= data << 32;
        break;
    case 0x30:
        n->bar.acq = data;
        break;
    case 0x34:
        n->bar.acq |= data << 32;
        break;
    default:
        break;
    }
}

static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    uint8_t *ptr = (uint8_t *)&n->bar;
    uint64_t val = 0;

    if (addr < sizeof(n->bar)) {
        memcpy(&val, ptr + addr, size);
    }
    return val;
}
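/*
 * Handle a doorbell write: odd doorbell indexes update a completion queue
 * head (restarting its submission queues if the CQ had been full), even
 * ones update a submission queue tail and kick its processing timer.
 */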
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    uint32_t qid;

    if (addr & ((1 << 2) - 1)) {
        return;
    }

    if (((addr - 0x1000) >> 2) & 1) {
        uint16_t new_head = val & 0xffff;
        int start_sqs;
        NvmeCQueue *cq;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (nvme_check_cqid(n, qid)) {
            return;
        }

        cq = n->cq[qid];
        if (new_head >= cq->size) {
            return;
        }

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
                timer_mod(sq->timer,
                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
            }
            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }

        if (cq->tail != cq->head) {
            nvme_isr_notify(n, cq);
        }
    } else {
        uint16_t new_tail = val & 0xffff;
        NvmeSQueue *sq;

        qid = (addr - 0x1000) >> 3;
        if (nvme_check_sqid(n, qid)) {
            return;
        }

        sq = n->sq[qid];
        if (new_tail >= sq->size) {
            return;
        }

        sq->tail = new_tail;
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}

static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
    unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;

    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else if (addr >= 0x1000) {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};
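/*
 * PCI init hook: check the backing drive and serial, set up BAR 0 and
 * MSI-X, fill in the Identify Controller data, and create a single
 * namespace covering the whole drive.
 */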
static int nvme_init(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeIdCtrl *id = &n->id_ctrl;

    int i;
    int64_t bs_size;
    uint8_t *pci_conf;

    if (!(n->conf.bs)) {
        return -1;
    }

    bs_size = bdrv_getlength(n->conf.bs);
    if (bs_size < 0) {
        return -1;
    }

    blkconf_serial(&n->conf, &n->serial);
    if (!n->serial) {
        return -1;
    }

    pci_conf = pci_dev->config;
    pci_conf[PCI_INTERRUPT_PIN] = 1;
    pci_config_set_prog_interface(pci_dev->config, 0x2);
    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
    pcie_endpoint_cap_init(&n->parent_obj, 0x80);

    n->num_namespaces = 1;
    n->num_queues = 64;
    n->reg_size = 1 << qemu_fls(0x1004 + 2 * (n->num_queues + 1) * 4);
    n->ns_size = bs_size / (uint64_t)n->num_namespaces;

    n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
    n->sq = g_new0(NvmeSQueue *, n->num_queues);
    n->cq = g_new0(NvmeCQueue *, n->num_queues);

    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
        "nvme", n->reg_size);
    pci_register_bar(&n->parent_obj, 0,
        PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
        &n->iomem);
    msix_init_exclusive_bar(&n->parent_obj, n->num_queues, 4);

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->serial, ' ');
    id->rab = 6;
    id->ieee[0] = 0x00;
    id->ieee[1] = 0x02;
    id->ieee[2] = 0xb3;
    id->oacs = cpu_to_le16(0);
    id->frmw = 7 << 1;
    id->lpa = 1 << 0;
    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(n->num_namespaces);
    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    n->bar.cap = 0;
    NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
    NVME_CAP_SET_CQR(n->bar.cap, 1);
    NVME_CAP_SET_AMS(n->bar.cap, 1);
    NVME_CAP_SET_TO(n->bar.cap, 0xf);
    NVME_CAP_SET_CSS(n->bar.cap, 1);
    n->bar.vs = 0x00010001;
    n->bar.intmc = n->bar.intms = 0;

    for (i = 0; i < n->num_namespaces; i++) {
        NvmeNamespace *ns = &n->namespaces[i];
        NvmeIdNs *id_ns = &ns->id_ns;
        id_ns->nsfeat = 0;
        id_ns->nlbaf = 0;
        id_ns->flbas = 0;
        id_ns->mc = 0;
        id_ns->dpc = 0;
        id_ns->dps = 0;
        id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
        id_ns->ncap = id_ns->nuse = id_ns->nsze =
            cpu_to_le64(n->ns_size >>
                id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
    }
    return 0;
}
static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);

    nvme_clear_ctrl(n);
    g_free(n->namespaces);
    g_free(n->cq);
    g_free(n->sq);
    msix_uninit_exclusive_bar(pci_dev);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
    DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->init = nvme_init;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->vendor_id = PCI_VENDOR_ID_INTEL;
    pc->device_id = 0x5845;
    pc->revision = 1;
    pc->is_express = 1;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    dc->props = nvme_props;
    dc->vmsd = &nvme_vmstate;
}

static const TypeInfo nvme_info = {
    .name = "nvme",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .class_init = nvme_class_init,
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
}

type_init(nvme_register_types)