hw/block/nvme: add support for scatter gather lists

For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
This commit is contained in:
Klaus Jensen 2019-04-12 20:53:16 +02:00
parent 6e0ac3a03f
commit cba0a8a344
3 changed files with 279 additions and 60 deletions

View File

@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
return NVME_SUCCESS;
}
static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
uint64_t prp1, uint64_t prp2, DMADirection dir,
/*
* Map 'nsgld' data descriptors from 'segment'. The function will subtract the
* number of bytes mapped in len.
*/
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
QEMUIOVector *iov,
NvmeSglDescriptor *segment, uint64_t nsgld,
size_t *len, NvmeRequest *req)
{
dma_addr_t addr, trans_len;
uint32_t dlen;
uint16_t status;
for (int i = 0; i < nsgld; i++) {
uint8_t type = NVME_SGL_TYPE(segment[i].type);
switch (type) {
case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
break;
case NVME_SGL_DESCR_TYPE_SEGMENT:
case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
default:
return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
}
dlen = le32_to_cpu(segment[i].len);
if (!dlen) {
continue;
}
if (*len == 0) {
/*
* All data has been mapped, but the SGL contains additional
* segments and/or descriptors. The controller might accept
* ignoring the rest of the SGL.
*/
uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
break;
}
trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req));
return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
}
trans_len = MIN(*len, dlen);
addr = le64_to_cpu(segment[i].addr);
if (UINT64_MAX - addr < dlen) {
return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
}
status = nvme_map_addr(n, qsg, iov, addr, trans_len);
if (status) {
return status;
}
*len -= trans_len;
}
return NVME_SUCCESS;
}
static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
NvmeSglDescriptor sgl, size_t len,
NvmeRequest *req)
{
/*
* Read the segment in chunks of 256 descriptors (one 4k page) to avoid
* dynamically allocating a potentially huge SGL. The spec allows the SGL
* to be larger (as in number of bytes required to describe the SGL
* descriptors and segment chain) than the command transfer size, so it is
* not bounded by MDTS.
*/
const int SEG_CHUNK_SIZE = 256;
NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
uint64_t nsgld;
uint32_t seg_len;
uint16_t status;
bool sgl_in_cmb = false;
hwaddr addr;
int ret;
sgld = &sgl;
addr = le64_to_cpu(sgl.addr);
trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len);
/*
* If the entire transfer can be described with a single data block it can
* be mapped directly.
*/
if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
if (status) {
goto unmap;
}
goto out;
}
/*
* If the segment is located in the CMB, the submission queue of the
* request must also reside there.
*/
if (nvme_addr_is_cmb(n, addr)) {
if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
return NVME_INVALID_USE_OF_CMB | NVME_DNR;
}
sgl_in_cmb = true;
}
for (;;) {
switch (NVME_SGL_TYPE(sgld->type)) {
case NVME_SGL_DESCR_TYPE_SEGMENT:
case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
break;
default:
return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
}
seg_len = le32_to_cpu(sgld->len);
/* check the length of the (Last) Segment descriptor */
if (!seg_len || seg_len & 0xf) {
return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
}
if (UINT64_MAX - addr < seg_len) {
return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
}
nsgld = seg_len / sizeof(NvmeSglDescriptor);
while (nsgld > SEG_CHUNK_SIZE) {
if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
trace_pci_nvme_err_addr_read(addr);
status = NVME_DATA_TRAS_ERROR;
goto unmap;
}
status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
&len, req);
if (status) {
goto unmap;
}
nsgld -= SEG_CHUNK_SIZE;
addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
}
ret = nvme_addr_read(n, addr, segment, nsgld *
sizeof(NvmeSglDescriptor));
if (ret) {
trace_pci_nvme_err_addr_read(addr);
status = NVME_DATA_TRAS_ERROR;
goto unmap;
}
last_sgld = &segment[nsgld - 1];
/* if the segment ends with a Data Block, then we are done */
if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
if (status) {
goto unmap;
}
goto out;
}
/*
* If the last descriptor was not a Data Block, then the current
* segment must not be a Last Segment.
*/
if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
goto unmap;
}
sgld = last_sgld;
addr = le64_to_cpu(sgld->addr);
/*
* Do not map the last descriptor; it will be a Segment or Last Segment
* descriptor and is handled by the next iteration.
*/
status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
if (status) {
goto unmap;
}
/*
* If the next segment is in the CMB, make sure that the sgl was
* already located there.
*/
if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
goto unmap;
}
}
out:
/* if there is any residual left in len, the SGL was too short */
if (len) {
status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
goto unmap;
}
return NVME_SUCCESS;
unmap:
if (iov->iov) {
qemu_iovec_destroy(iov);
}
if (qsg->sg) {
qemu_sglist_destroy(qsg);
}
return status;
}
static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
{
uint64_t prp1, prp2;
switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
case NVME_PSDT_PRP:
prp1 = le64_to_cpu(req->cmd.dptr.prp1);
prp2 = le64_to_cpu(req->cmd.dptr.prp2);
return nvme_map_prp(n, prp1, prp2, len, req);
case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
case NVME_PSDT_SGL_MPTR_SGL:
/* SGLs shall not be used for Admin commands in NVMe over PCIe */
if (!req->sq->sqid) {
return NVME_INVALID_FIELD | NVME_DNR;
}
return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len,
req);
default:
return NVME_INVALID_FIELD;
}
}
static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
DMADirection dir, NvmeRequest *req)
{
uint16_t status = NVME_SUCCESS;
status = nvme_map_prp(n, prp1, prp2, len, req);
status = nvme_map_dptr(n, len, req);
if (status) {
return status;
}
@ -458,15 +707,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
return status;
}
static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
{
NvmeCmd *cmd = &req->cmd;
uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
return nvme_map_prp(n, prp1, prp2, len, req);
}
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@ -929,10 +1169,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
uint64_t off, NvmeRequest *req)
{
NvmeCmd *cmd = &req->cmd;
uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
uint32_t nsid = le32_to_cpu(cmd->nsid);
uint32_t nsid = le32_to_cpu(req->cmd.nsid);
uint32_t trans_len;
time_t current_ms;
@ -981,17 +1218,14 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
nvme_clear_events(n, NVME_AER_TYPE_SMART);
}
return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, (uint8_t *) &smart + off, trans_len,
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
NvmeRequest *req)
{
uint32_t trans_len;
NvmeCmd *cmd = &req->cmd;
uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
NvmeFwSlotInfoLog fw_log = {
.afi = 0x1,
};
@ -1004,17 +1238,14 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
trans_len = MIN(sizeof(fw_log) - off, buf_len);
return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len,
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
uint64_t off, NvmeRequest *req)
{
uint32_t trans_len;
NvmeCmd *cmd = &req->cmd;
uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
NvmeErrorLog errlog;
if (!rae) {
@ -1029,8 +1260,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
trans_len = MIN(sizeof(errlog) - off, buf_len);
return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2,
DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, (uint8_t *)&errlog, trans_len,
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
@ -1190,14 +1421,10 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
uint64_t prp1 = le64_to_cpu(c->prp1);
uint64_t prp2 = le64_to_cpu(c->prp2);
trace_pci_nvme_identify_ctrl();
return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
prp2, DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
@ -1205,8 +1432,6 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
NvmeNamespace *ns;
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
uint32_t nsid = le32_to_cpu(c->nsid);
uint64_t prp1 = le64_to_cpu(c->prp1);
uint64_t prp2 = le64_to_cpu(c->prp2);
trace_pci_nvme_identify_ns(nsid);
@ -1217,8 +1442,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
ns = &n->namespaces[nsid - 1];
return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
prp2, DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
@ -1226,8 +1451,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
static const int data_len = NVME_IDENTIFY_DATA_SIZE;
uint32_t min_nsid = le32_to_cpu(c->nsid);
uint64_t prp1 = le64_to_cpu(c->prp1);
uint64_t prp2 = le64_to_cpu(c->prp2);
uint32_t *list;
uint16_t ret;
int i, j = 0;
@ -1254,8 +1477,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
break;
}
}
ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2,
DMA_DIRECTION_FROM_DEVICE, req);
ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
req);
g_free(list);
return ret;
}
@ -1264,8 +1487,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
{
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
uint32_t nsid = le32_to_cpu(c->nsid);
uint64_t prp1 = le64_to_cpu(c->prp1);
uint64_t prp2 = le64_to_cpu(c->prp2);
uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@ -1297,8 +1518,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
stl_be_p(&ns_descrs->uuid.v, nsid);
return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2,
DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
@ -1369,14 +1590,10 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
NvmeCmd *cmd = &req->cmd;
uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
uint64_t timestamp = nvme_get_timestamp(n);
return nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
prp2, DMA_DIRECTION_FROM_DEVICE, req);
return nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
@ -1505,12 +1722,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
uint16_t ret;
uint64_t timestamp;
NvmeCmd *cmd = &req->cmd;
uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
ret = nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
prp2, DMA_DIRECTION_TO_DEVICE, req);
ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
DMA_DIRECTION_TO_DEVICE, req);
if (ret != NVME_SUCCESS) {
return ret;
}
@ -2437,6 +2651,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->nn = cpu_to_le32(n->num_namespaces);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
NVME_ONCS_FEATURES);
id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial);
strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0');

View File

@ -36,6 +36,7 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2
pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64""
pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
@ -91,6 +92,9 @@ pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
pci_nvme_err_cfs(void) "controller fatal status"
pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16""
pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16""
pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""

View File

@ -412,9 +412,9 @@ typedef union NvmeCmdDptr {
} NvmeCmdDptr;
enum NvmePsdt {
PSDT_PRP = 0x0,
PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
PSDT_SGL_MPTR_SGL = 0x2,
NVME_PSDT_PRP = 0x0,
NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
NVME_PSDT_SGL_MPTR_SGL = 0x2,
};
typedef struct QEMU_PACKED NvmeCmd {