From 736b01642d85be832385063f278fe7cd4ffb5221 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 17 Dec 2021 10:44:01 +0100 Subject: [PATCH 1/6] hw/nvme: fix CVE-2021-3929 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes CVE-2021-3929 "locally" by denying DMA to the iomem of the device itself. This still allows DMA to MMIO regions of other devices (e.g. doing P2P DMA to the controller memory buffer of another NVMe device). Fixes: CVE-2021-3929 Reported-by: Qiuhao Li Reviewed-by: Keith Busch Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 1f62116af9..37681a9759 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -357,6 +357,24 @@ static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); } +static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr) +{ + hwaddr hi, lo; + + /* + * The purpose of this check is to guard against invalid "local" access to + * the iomem (i.e. controller registers). Thus, we check against the range + * covered by the 'bar0' MemoryRegion since that is currently composed of + * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however, + * that if the device model is ever changed to allow the CMB to be located + * in BAR0 as well, then this must be changed. + */ + lo = n->bar0.addr; + hi = lo + int128_get64(n->bar0.size); + + return addr >= lo && addr < hi; +} + static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { hwaddr hi = addr + size - 1; @@ -614,6 +632,10 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len) trace_pci_nvme_map_addr(addr, len); + if (nvme_addr_is_iomem(n, addr)) { + return NVME_DATA_TRAS_ERROR; + } + if (nvme_addr_is_cmb(n, addr)) { cmb = true; } else if (nvme_addr_is_pmr(n, addr)) { From e080ce8676e9097184304a2ed11bf95443c0e547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 11 Nov 2021 16:45:51 +0100 Subject: [PATCH 2/6] hw/nvme/ctrl: Have nvme_addr_write() take const buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'buf' argument is not modified, so better pass it as const type. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 37681a9759..12e1fcda7c 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -395,7 +395,7 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) return pci_dma_read(&n->parent_obj, addr, buf, size); } -static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size) +static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size) { hwaddr hi = addr + size - 1; if (hi < addr) { From 8d3a17be6f556a996ab9404bead7fc58758c21eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 11 Nov 2021 16:45:52 +0100 Subject: [PATCH 3/6] hw/nvme/ctrl: Pass buffers as 'void *' types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These buffers can be anything, not an array of chars, so use the 'void *' type for them. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 10 +++++----- hw/nvme/nvme.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 12e1fcda7c..4344405e59 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1162,7 +1162,7 @@ static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, return NVME_SUCCESS; } -static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, +static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len, NvmeTxDirection dir) { assert(sg->flags & NVME_SG_ALLOC); @@ -1199,7 +1199,7 @@ static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, return NVME_SUCCESS; } -static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, +static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len, NvmeRequest *req) { uint16_t status; @@ -1212,7 +1212,7 @@ static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE); } -static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, +static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len, NvmeRequest *req) { uint16_t status; @@ -1225,7 +1225,7 @@ static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE); } -uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, +uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len, NvmeTxDirection dir, NvmeRequest *req) { NvmeNamespace *ns = req->ns; @@ -1241,7 +1241,7 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return nvme_tx(n, &req->sg, ptr, len, dir); } -uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len, +uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len, NvmeTxDirection dir, NvmeRequest *req) { NvmeNamespace *ns = req->ns; diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 83ffabade4..38f3ebf7f6 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -495,9 +495,9 @@ static inline uint16_t nvme_cid(NvmeRequest *req) } void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns); -uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, +uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len, NvmeTxDirection dir, NvmeRequest *req); -uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len, +uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len, NvmeTxDirection dir, NvmeRequest *req); void nvme_rw_complete_cb(void *opaque, int ret); uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, From 6190d92ff70c177e901a85fe0c2da44e34c606f9 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 22 Nov 2021 23:22:27 +0100 Subject: [PATCH 4/6] hw/nvme: add struct for zone management send Add struct for Zone Management Send in preparation for more zone send flags. Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 10 ++++------ include/block/nvme.h | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 4344405e59..7cb4974c5e 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -3616,26 +3616,24 @@ done: static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) { - NvmeCmd *cmd = (NvmeCmd *)&req->cmd; + NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd; NvmeNamespace *ns = req->ns; NvmeZone *zone; NvmeZoneResetAIOCB *iocb; uint8_t *zd_ext; - uint32_t dw13 = le32_to_cpu(cmd->cdw13); uint64_t slba = 0; uint32_t zone_idx = 0; uint16_t status; - uint8_t action; + uint8_t action = cmd->zsa; bool all; enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; - action = dw13 & 0xff; - all = !!(dw13 & 0x100); + all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL; req->status = NVME_SUCCESS; if (!all) { - status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); + status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx); if (status) { return status; } diff --git a/include/block/nvme.h b/include/block/nvme.h index e3bd47bf76..709d491c70 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1433,6 +1433,21 @@ enum NvmeZoneType { NVME_ZONE_TYPE_SEQ_WRITE = 0x02, }; +typedef struct QEMU_PACKED NvmeZoneSendCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint32_t rsvd8[4]; + NvmeCmdDptr dptr; + uint64_t slba; + uint32_t rsvd48; + uint8_t zsa; + uint8_t zsflags; + uint8_t rsvd54[2]; + uint32_t rsvd56[2]; +} NvmeZoneSendCmd; + enum NvmeZoneSendAction { NVME_ZONE_ACTION_RSD = 0x00, NVME_ZONE_ACTION_CLOSE = 0x01, @@ -1443,6 +1458,10 @@ enum NvmeZoneSendAction { NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, }; +enum { + NVME_ZSFLAG_SELECT_ALL = 1 << 0, +}; + typedef struct QEMU_PACKED NvmeZoneDescr { uint8_t zt; uint8_t zs; From 25872031e14edf6a47bff1c015a026afe5c1c967 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 22 Nov 2021 23:38:31 +0100 Subject: [PATCH 5/6] hw/nvme: add ozcs enum Add enumeration for OZCS values. Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/nvme/ns.c | 3 ++- include/block/nvme.h | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 8b5f98c761..356b6c1c2f 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -266,7 +266,8 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns) id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1); id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1); id_ns_z->zoc = 0; - id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00; + id_ns_z->ozcs = ns->params.cross_zone_read ? + NVME_ID_NS_ZONED_OZCS_RAZB : 0x00; for (i = 0; i <= ns->id_ns.nlbaf; i++) { id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size); diff --git a/include/block/nvme.h b/include/block/nvme.h index 709d491c70..e10ea6f0eb 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1351,6 +1351,10 @@ typedef struct QEMU_PACKED NvmeIdNsZoned { uint8_t vs[256]; } NvmeIdNsZoned; +enum NvmeIdNsZonedOzcs { + NVME_ID_NS_ZONED_OZCS_RAZB = 1 << 0, +}; + /*Deallocate Logical Block Features*/ #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08) From e321b4cdc2dd0b5e806ecf759138be7f83774142 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 4 Mar 2021 08:40:11 +0100 Subject: [PATCH 6/6] hw/nvme: add support for zoned random write area Add support for TP 4076 ("Zoned Random Write Area"), v2021.08.23 ("Ratified"). This adds three new namespace parameters: "zoned.numzrwa" (number of zrwa resources, i.e. number of zones that can have a zrwa), "zoned.zrwas" (zrwa size in LBAs), "zoned.zrwafg" (granularity in LBAs for flushes). Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 171 ++++++++++++++++++++++++++++++++++++++----- hw/nvme/ns.c | 58 +++++++++++++++ hw/nvme/nvme.h | 10 +++ hw/nvme/trace-events | 1 + include/block/nvme.h | 17 ++++- 5 files changed, 237 insertions(+), 20 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 7cb4974c5e..98aac98bef 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -299,26 +299,37 @@ static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, } } -/* - * Check if we can open a zone without exceeding open/active limits. - * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). - */ -static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) +static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act, + uint32_t opn, uint32_t zrwa) { if (ns->params.max_active_zones != 0 && ns->nr_active_zones + act > ns->params.max_active_zones) { trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones); return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; } + if (ns->params.max_open_zones != 0 && ns->nr_open_zones + opn > ns->params.max_open_zones) { trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones); return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; } + if (zrwa > ns->zns.numzrwa) { + return NVME_NOZRWA | NVME_DNR; + } + return NVME_SUCCESS; } +/* + * Check if we can open a zone without exceeding open/active limits. + * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). + */ +static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) +{ + return nvme_zns_check_resources(ns, act, opn, 0); +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr hi, lo; @@ -1628,9 +1639,19 @@ static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, return status; } - if (unlikely(slba != zone->w_ptr)) { - trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); - return NVME_ZONE_INVALID_WRITE; + if (zone->d.za & NVME_ZA_ZRWA_VALID) { + uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas; + + if (slba < zone->w_ptr || slba + nlb > ezrwa) { + trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr); + return NVME_ZONE_INVALID_WRITE; + } + } else { + if (unlikely(slba != zone->w_ptr)) { + trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, + zone->w_ptr); + return NVME_ZONE_INVALID_WRITE; + } } if (unlikely((slba + nlb) > zcap)) { @@ -1710,6 +1731,14 @@ static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) /* fallthrough */ case NVME_ZONE_STATE_CLOSED: nvme_aor_dec_active(ns); + + if (zone->d.za & NVME_ZA_ZRWA_VALID) { + zone->d.za &= ~NVME_ZA_ZRWA_VALID; + if (ns->params.numzrwa) { + ns->zns.numzrwa++; + } + } + /* fallthrough */ case NVME_ZONE_STATE_EMPTY: nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); @@ -1745,6 +1774,13 @@ static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone) /* fallthrough */ case NVME_ZONE_STATE_CLOSED: nvme_aor_dec_active(ns); + + if (zone->d.za & NVME_ZA_ZRWA_VALID) { + if (ns->params.numzrwa) { + ns->zns.numzrwa++; + } + } + /* fallthrough */ case NVME_ZONE_STATE_FULL: zone->w_ptr = zone->d.zslba; @@ -1778,6 +1814,7 @@ static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) enum { NVME_ZRM_AUTO = 1 << 0, + NVME_ZRM_ZRWA = 1 << 1, }; static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns, @@ -1796,7 +1833,8 @@ static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns, if (n->params.auto_transition_zones) { nvme_zrm_auto_transition_zone(ns); } - status = nvme_aor_check(ns, act, 1); + status = nvme_zns_check_resources(ns, act, 1, + (flags & NVME_ZRM_ZRWA) ? 1 : 0); if (status) { return status; } @@ -1824,6 +1862,12 @@ static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns, /* fallthrough */ case NVME_ZONE_STATE_EXPLICITLY_OPEN: + if (flags & NVME_ZRM_ZRWA) { + ns->zns.numzrwa--; + + zone->d.za |= NVME_ZA_ZRWA_VALID; + } + return NVME_SUCCESS; default: @@ -1837,12 +1881,6 @@ static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns, return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO); } -static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns, - NvmeZone *zone) -{ - return nvme_zrm_open_flags(n, ns, zone, 0); -} - static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, uint32_t nlb) { @@ -1853,6 +1891,20 @@ static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, } } +static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone, + uint32_t nlbc) +{ + uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg); + + nlbc = nzrwafgs * ns->zns.zrwafg; + + trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc); + + zone->w_ptr += nlbc; + + nvme_advance_zone_wp(ns, zone, nlbc); +} + static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; @@ -1865,6 +1917,17 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) zone = nvme_get_zone_by_slba(ns, slba); assert(zone); + if (zone->d.za & NVME_ZA_ZRWA_VALID) { + uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1; + uint64_t elba = slba + nlb - 1; + + if (elba > ezrwa) { + nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa); + } + + return; + } + nvme_advance_zone_wp(ns, zone, nlb); } @@ -2665,7 +2728,9 @@ static void nvme_copy_in_completed_cb(void *opaque, int ret) goto invalid; } - iocb->zone->w_ptr += nlb; + if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) { + iocb->zone->w_ptr += nlb; + } } qemu_iovec_reset(&iocb->iov); @@ -3204,6 +3269,10 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, if (append) { bool piremap = !!(ctrl & NVME_RW_PIREMAP); + if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) { + return NVME_INVALID_ZONE_OP | NVME_DNR; + } + if (unlikely(slba != zone->d.zslba)) { trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); status = NVME_INVALID_FIELD; @@ -3255,7 +3324,9 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } - zone->w_ptr += nlb; + if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) { + zone->w_ptr += nlb; + } } data_offset = nvme_l2b(ns, slba); @@ -3339,7 +3410,24 @@ enum NvmeZoneProcessingMask { static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - return nvme_zrm_open(nvme_ctrl(req), ns, zone); + NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd; + int flags = 0; + + if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) { + uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs); + + if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) { + return NVME_INVALID_ZONE_OP | NVME_DNR; + } + + if (zone->w_ptr % ns->zns.zrwafg) { + return NVME_NOZRWA | NVME_DNR; + } + + flags = NVME_ZRM_ZRWA; + } + + return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags); } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, @@ -3614,6 +3702,44 @@ done: } } +static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone, + uint64_t elba, NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs); + uint64_t wp = zone->d.wp; + uint32_t nlb = elba - wp + 1; + uint16_t status; + + + if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) { + return NVME_INVALID_ZONE_OP | NVME_DNR; + } + + if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (elba < wp || elba > wp + ns->zns.zrwas) { + return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR; + } + + if (nlb % ns->zns.zrwafg) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + status = nvme_zrm_auto(n, ns, zone); + if (status) { + return status; + } + + zone->w_ptr += nlb; + + nvme_advance_zone_wp(ns, zone, nlb); + + return NVME_SUCCESS; +} + static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) { NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd; @@ -3640,7 +3766,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) } zone = &ns->zone_array[zone_idx]; - if (slba != zone->d.zslba) { + if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) { trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); return NVME_INVALID_FIELD | NVME_DNR; } @@ -3716,6 +3842,13 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) } break; + case NVME_ZONE_ACTION_ZRWA_FLUSH: + if (all) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req); + default: trace_pci_nvme_err_invalid_mgmt_action(action); status = NVME_INVALID_FIELD; diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 356b6c1c2f..ee673f1a5b 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -275,6 +275,23 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns) ns->params.zd_extension_size >> 6; /* Units of 64B */ } + if (ns->params.zrwas) { + ns->zns.numzrwa = ns->params.numzrwa ? + ns->params.numzrwa : ns->num_zones; + + ns->zns.zrwas = ns->params.zrwas >> ns->lbaf.ds; + ns->zns.zrwafg = ns->params.zrwafg >> ns->lbaf.ds; + + id_ns_z->ozcs |= NVME_ID_NS_ZONED_OZCS_ZRWASUP; + id_ns_z->zrwacap = NVME_ID_NS_ZONED_ZRWACAP_EXPFLUSHSUP; + + id_ns_z->numzrwa = cpu_to_le32(ns->params.numzrwa); + id_ns_z->zrwas = cpu_to_le16(ns->zns.zrwas); + id_ns_z->zrwafg = cpu_to_le16(ns->zns.zrwafg); + } + + id_ns_z->ozcs = cpu_to_le16(id_ns_z->ozcs); + ns->csi = NVME_CSI_ZONED; ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); ns->id_ns.ncap = ns->id_ns.nsze; @@ -315,6 +332,10 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); } else { trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); + if (zone->d.za & NVME_ZA_ZRWA_VALID) { + zone->d.za &= ~NVME_ZA_ZRWA_VALID; + ns->zns.numzrwa++; + } nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); } } @@ -392,6 +413,40 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) return -1; } } + + if (ns->params.zrwas) { + if (ns->params.zrwas % ns->blkconf.logical_block_size) { + error_setg(errp, "zone random write area size (zoned.zrwas " + "%"PRIu64") must be a multiple of the logical " + "block size (logical_block_size %"PRIu32")", + ns->params.zrwas, ns->blkconf.logical_block_size); + return -1; + } + + if (ns->params.zrwafg == -1) { + ns->params.zrwafg = ns->blkconf.logical_block_size; + } + + if (ns->params.zrwas % ns->params.zrwafg) { + error_setg(errp, "zone random write area size (zoned.zrwas " + "%"PRIu64") must be a multiple of the zone random " + "write area flush granularity (zoned.zrwafg, " + "%"PRIu64")", ns->params.zrwas, ns->params.zrwafg); + return -1; + } + + if (ns->params.max_active_zones) { + if (ns->params.numzrwa > ns->params.max_active_zones) { + error_setg(errp, "number of zone random write area " + "resources (zoned.numzrwa, %d) must be less " + "than or equal to maximum active resources " + "(zoned.max_active_zones, %d)", + ns->params.numzrwa, + ns->params.max_active_zones); + return -1; + } + } + } } return 0; @@ -551,6 +606,9 @@ static Property nvme_ns_props[] = { params.max_open_zones, 0), DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, params.zd_extension_size, 0), + DEFINE_PROP_UINT32("zoned.numzrwa", NvmeNamespace, params.numzrwa, 0), + DEFINE_PROP_SIZE("zoned.zrwas", NvmeNamespace, params.zrwas, 0), + DEFINE_PROP_SIZE("zoned.zrwafg", NvmeNamespace, params.zrwafg, -1), DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default, true), DEFINE_PROP_END_OF_LIST(), diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 38f3ebf7f6..90c0bb7ce2 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -114,6 +114,10 @@ typedef struct NvmeNamespaceParams { uint32_t max_active_zones; uint32_t max_open_zones; uint32_t zd_extension_size; + + uint32_t numzrwa; + uint64_t zrwas; + uint64_t zrwafg; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -130,6 +134,12 @@ typedef struct NvmeNamespace { uint16_t status; int attached; + struct { + uint16_t zrwas; + uint16_t zrwafg; + uint32_t numzrwa; + } zns; + QTAILQ_ENTRY(NvmeNamespace) entry; NvmeIdNsZoned *id_ns_zoned; diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events index ff6cafd520..90730d802f 100644 --- a/hw/nvme/trace-events +++ b/hw/nvme/trace-events @@ -103,6 +103,7 @@ pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone de pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_idx=%"PRIu32"" pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state" pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" +pci_nvme_zoned_zrwa_implicit_flush(uint64_t zslba, uint32_t nlb) "zslba 0x%"PRIx64" nlb %"PRIu32"" # error conditions pci_nvme_err_mdts(size_t len) "len %zu" diff --git a/include/block/nvme.h b/include/block/nvme.h index e10ea6f0eb..cd068ac891 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -890,6 +890,8 @@ enum NvmeStatusCodes { NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO = 0x0182, NVME_CMD_SIZE_LIMIT = 0x0183, + NVME_INVALID_ZONE_OP = 0x01b6, + NVME_NOZRWA = 0x01b7, NVME_ZONE_BOUNDARY_ERROR = 0x01b8, NVME_ZONE_FULL = 0x01b9, NVME_ZONE_READ_ONLY = 0x01ba, @@ -1345,7 +1347,12 @@ typedef struct QEMU_PACKED NvmeIdNsZoned { uint32_t mor; uint32_t rrl; uint32_t frl; - uint8_t rsvd20[2796]; + uint8_t rsvd12[24]; + uint32_t numzrwa; + uint16_t zrwafg; + uint16_t zrwas; + uint8_t zrwacap; + uint8_t rsvd53[2763]; NvmeLBAFE lbafe[16]; uint8_t rsvd3072[768]; uint8_t vs[256]; @@ -1353,6 +1360,11 @@ typedef struct QEMU_PACKED NvmeIdNsZoned { enum NvmeIdNsZonedOzcs { NVME_ID_NS_ZONED_OZCS_RAZB = 1 << 0, + NVME_ID_NS_ZONED_OZCS_ZRWASUP = 1 << 1, +}; + +enum NvmeIdNsZonedZrwacap { + NVME_ID_NS_ZONED_ZRWACAP_EXPFLUSHSUP = 1 << 0, }; /*Deallocate Logical Block Features*/ @@ -1408,6 +1420,7 @@ enum NvmeZoneAttr { NVME_ZA_FINISHED_BY_CTLR = 1 << 0, NVME_ZA_FINISH_RECOMMENDED = 1 << 1, NVME_ZA_RESET_RECOMMENDED = 1 << 2, + NVME_ZA_ZRWA_VALID = 1 << 3, NVME_ZA_ZD_EXT_VALID = 1 << 7, }; @@ -1460,10 +1473,12 @@ enum NvmeZoneSendAction { NVME_ZONE_ACTION_RESET = 0x04, NVME_ZONE_ACTION_OFFLINE = 0x05, NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, + NVME_ZONE_ACTION_ZRWA_FLUSH = 0x11, }; enum { NVME_ZSFLAG_SELECT_ALL = 1 << 0, + NVME_ZSFLAG_ZRWA_ALLOC = 1 << 1, }; typedef struct QEMU_PACKED NvmeZoneDescr {