hw/nvme: reimplement dsm to allow cancellation

Prior to this patch, a loop was used to issue multiple "fire and forget"
aios for each range in the command. Without a reference to the aiocb
returned from the blk_aio_pdiscard calls, the aios cannot be canceled.

Fix this by processing the ranges one after another.

As a bonus, this fixes how metadata is cleared (i.e. we only zero it out
if the data was successfully discarded).

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
This commit is contained in:
Klaus Jensen 2021-06-17 21:06:49 +02:00
parent ff0ac2c8b8
commit d7d1474fd8
2 changed files with 162 additions and 75 deletions

View File

@ -2015,26 +2015,6 @@ out:
nvme_verify_cb(ctx, ret); nvme_verify_cb(ctx, ret);
} }
/*
 * Completion callback for one discard AIO issued on behalf of a request.
 *
 * req->opaque is reinterpreted as a counter of outstanding discards. Any
 * non-zero ret is reported through nvme_aio_err(). The request is enqueued
 * for completion only when the counter drops to zero.
 */
static void nvme_aio_discard_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    uintptr_t *outstanding = (uintptr_t *)&req->opaque;

    trace_pci_nvme_aio_discard_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    /* Other discards are still pending; the last one completes the request. */
    if (--(*outstanding) != 0) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}
struct nvme_zone_reset_ctx { struct nvme_zone_reset_ctx {
NvmeRequest *req; NvmeRequest *req;
NvmeZone *zone; NvmeZone *zone;
@ -2495,75 +2475,182 @@ out:
nvme_enqueue_req_completion(nvme_cq(req), req); nvme_enqueue_req_completion(nvme_cq(req), req);
} }
/*
 * State for one Dataset Management (deallocate) operation whose ranges are
 * processed one after another so that the operation can be cancelled.
 */
typedef struct NvmeDSMAIOCB {
/* embedded AIOCB; nvme_dsm_cancel() recovers this struct from it */
BlockAIOCB common;
/* currently outstanding discard / write-zeroes AIO, or NULL when none */
BlockAIOCB *aiocb;
/* the request this operation belongs to */
NvmeRequest *req;
/* bottom half (nvme_dsm_bh) scheduled once processing has finished */
QEMUBH *bh;
/* result handed to common.cb; 0 or a negative value such as -ECANCELED */
int ret;
/* array of nr ranges (slba/nlb stored little-endian) */
NvmeDsmRange *range;
/* number of entries in range */
unsigned int nr;
/* index of the next range to process; set to nr to stop the loop */
unsigned int idx;
} NvmeDSMAIOCB;
/*
 * Asynchronous cancel hook for a DSM operation.
 *
 * Marks the operation as cancelled, makes nvme_dsm_cb() see all ranges as
 * consumed, and asks the block layer to cancel the AIO that is in flight,
 * if there is one.
 */
static void nvme_dsm_cancel(BlockAIOCB *aiocb)
{
    NvmeDSMAIOCB *dsm = container_of(aiocb, NvmeDSMAIOCB, common);

    /* stop nvme_dsm_cb from picking up any further range */
    dsm->idx = dsm->nr;
    dsm->ret = -ECANCELED;

    if (!dsm->aiocb) {
        /*
         * Nothing in flight: either a cancel was already requested, or the
         * last range finished and nvme_dsm_bh is about to run.
         */
        assert(dsm->idx == dsm->nr);
        return;
    }

    blk_aio_cancel_async(dsm->aiocb);
    dsm->aiocb = NULL;
}
/*
 * AIOCB descriptor passed to blk_aio_get() for DSM operations: gives the
 * size of NvmeDSMAIOCB and the asynchronous cancel hook.
 */
static const AIOCBInfo nvme_dsm_aiocb_info = {
.aiocb_size = sizeof(NvmeDSMAIOCB),
.cancel_async = nvme_dsm_cancel,
};
static void nvme_dsm_bh(void *opaque)
{
NvmeDSMAIOCB *iocb = opaque;
iocb->common.cb(iocb->common.opaque, iocb->ret);
qemu_bh_delete(iocb->bh);
iocb->bh = NULL;
qemu_aio_unref(iocb);
}
static void nvme_dsm_cb(void *opaque, int ret);

/*
 * Completion callback for the data discard of a single DSM range.
 *
 * On success, and if the namespace has per-block metadata (lbaf.ms != 0),
 * the metadata of the range is zeroed, but only when the block status
 * query reports that all data blocks of the range read as zero. In every
 * other non-error case processing continues with the next range through
 * nvme_dsm_cb().
 *
 * @opaque: the NvmeDSMAIOCB driving the operation
 * @ret:    result of the discard; negative on error
 */
static void nvme_dsm_md_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeDsmRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    /*
     * A cancel sets idx to nr, so idx - 1 below would no longer identify the
     * range that was just discarded. Finish without touching any metadata.
     */
    if (iocb->ret < 0) {
        goto done;
    }

    if (!ns->lbaf.ms) {
        nvme_dsm_cb(iocb, 0);
        return;
    }

    /* idx was post-incremented when the discard was issued */
    range = &iocb->range[iocb->idx - 1];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

    /*
     * Check that all blocks were discarded (zeroed); otherwise we do not zero
     * the metadata.
     */
    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
    if (ret) {
        if (ret < 0) {
            iocb->ret = ret;
            goto done;
        }

        /*
         * Not everything was zeroed: leave the metadata alone and move on.
         * The return is essential; without it the write-zeroes below would
         * be issued as well and nvme_dsm_cb would run twice for this range.
         */
        nvme_dsm_cb(iocb, 0);
        return;
    }

    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
                                        nvme_dsm_cb, iocb);
    return;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}
/*
 * Drive the DSM state machine: issue the discard for the next usable range.
 *
 * Ranges larger than the controller's dmrsl limit, or failing the bounds
 * check, are traced and skipped. When an error is passed in, or no ranges
 * remain, the completion bottom half is scheduled instead.
 *
 * @opaque: the NvmeDSMAIOCB driving the operation
 * @ret:    result of the previous step; negative on error
 */
static void nvme_dsm_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;

    if (ret < 0) {
        iocb->ret = ret;
    } else {
        while (iocb->idx != iocb->nr) {
            NvmeDsmRange *cur = &iocb->range[iocb->idx++];
            uint64_t slba = le64_to_cpu(cur->slba);
            uint32_t nlb = le32_to_cpu(cur->nlb);

            trace_pci_nvme_dsm_deallocate(slba, nlb);

            if (nlb > n->dmrsl) {
                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
                continue;
            }

            if (nvme_check_bounds(ns, slba, nlb)) {
                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
                                                     ns->id_ns.nsze);
                continue;
            }

            /* one discard at a time; nvme_dsm_md_cb resumes the walk */
            iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk,
                                           nvme_l2b(ns, slba),
                                           nvme_l2b(ns, nlb),
                                           nvme_dsm_md_cb, iocb);
            return;
        }
    }

    /* error or all ranges consumed: hand over to the completion bottom half */
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}
static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{ {
NvmeNamespace *ns = req->ns; NvmeNamespace *ns = req->ns;
NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
uint32_t attr = le32_to_cpu(dsm->attributes); uint32_t attr = le32_to_cpu(dsm->attributes);
uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
uint16_t status = NVME_SUCCESS; uint16_t status = NVME_SUCCESS;
trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); trace_pci_nvme_dsm(nr, attr);
if (attr & NVME_DSMGMT_AD) { if (attr & NVME_DSMGMT_AD) {
int64_t offset; NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
size_t len; nvme_misc_cb, req);
NvmeDsmRange range[nr];
uintptr_t *discards = (uintptr_t *)&req->opaque;
status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req); iocb->req = req;
iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
iocb->ret = 0;
iocb->range = g_new(NvmeDsmRange, nr);
iocb->nr = nr;
iocb->idx = 0;
status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
req);
if (status) { if (status) {
return status; return status;
} }
/* req->aiocb = &iocb->common;
* AIO callbacks may be called immediately, so initialize discards to 1 nvme_dsm_cb(iocb, 0);
* to make sure the the callback does not complete the request before
* all discards have been issued.
*/
*discards = 1;
for (int i = 0; i < nr; i++) { return NVME_NO_COMPLETE;
uint64_t slba = le64_to_cpu(range[i].slba);
uint32_t nlb = le32_to_cpu(range[i].nlb);
if (nvme_check_bounds(ns, slba, nlb)) {
continue;
}
trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
nlb);
if (nlb > n->dmrsl) {
trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
}
offset = nvme_l2b(ns, slba);
len = nvme_l2b(ns, nlb);
while (len) {
size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
(*discards)++;
blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
nvme_aio_discard_cb, req);
offset += bytes;
len -= bytes;
}
}
/* account for the 1-initialization */
(*discards)--;
if (*discards) {
status = NVME_NO_COMPLETE;
} else {
status = req->status;
}
} }
return status; return status;

View File

@ -37,8 +37,8 @@ pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" bl
pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32"" pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" pci_nvme_dsm(uint32_t nr, uint32_t attr) "nr %"PRIu32" attr 0x%"PRIx32""
pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_dsm_deallocate(uint64_t slba, uint32_t nlb) "slba %"PRIu64" nlb %"PRIu32""
pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32"" pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""