xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 6c3a9247)
1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <keith.busch@intel.com>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10 
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13  *
14  *  https://nvmexpress.org/developers/nvme-specification/
15  *
16  *
17  * Notes on coding style
18  * ---------------------
19  * While QEMU coding style prefers lowercase hexadecimals in constants, the
20  * NVMe subsystem uses the format from the NVMe specifications in the comments
21  * (i.e. 'h' suffix instead of '0x' prefix).
22  *
23  * Usage
24  * -----
25  * See docs/system/nvme.rst for extensive documentation.
26  *
27  * Add options:
28  *      -drive file=<file>,if=none,id=<drive_id>
29  *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30  *      -device nvme,serial=<serial>,id=<bus_name>, \
31  *              cmb_size_mb=<cmb_size_mb[optional]>, \
32  *              [pmrdev=<mem_backend_file_id>,] \
33  *              max_ioqpairs=<N[optional]>, \
34  *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35  *              mdts=<N[optional]>,vsl=<N[optional]>, \
36  *              zoned.zasl=<N[optional]>, \
37  *              zoned.auto_transition=<on|off[optional]>, \
38  *              subsys=<subsys_id>
39  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
40  *              zoned=<true|false[optional]>, \
41  *              subsys=<subsys_id>,detached=<true|false[optional]>
42  *
43  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
44  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
45  * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
46  * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
47  *
48  * PMR emulation is enabled by pointing `pmrdev` to a memory-backend-file.
49  * For example:
50  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
51  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
52  *
53  * The PMR will use BAR 4/5 exclusively.
54  *
55  * To attach controller(s) and namespace(s) to a subsystem, provide an
56  * nvme-subsys device as shown above.
57  *
58  * nvme subsystem device parameters
59  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60  * - `nqn`
61  *   This parameter provides the `<nqn_id>` part of the string
62  *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
63  *   of subsystem controllers. Note that `<nqn_id>` should be unique per
64  *   subsystem, but this is not enforced by QEMU. If not specified, it will
65  *   default to the value of the `id` parameter (`<subsys_id>`).
66  *
67  * nvme device parameters
68  * ~~~~~~~~~~~~~~~~~~~~~~
69  * - `subsys`
70  *   Specifying this parameter attaches the controller to the subsystem and
71  *   the SUBNQN field in the controller will report the NQN of the subsystem
72  *   device. This also enables the multi-controller capability represented
73  *   in the Identify Controller data structure in CMIC (Controller Multi-path
74  *   I/O and Namespace Sharing Capabilities).
75  *
76  * - `aerl`
77  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
78  *   of concurrently outstanding Asynchronous Event Request commands supported
79  *   by the controller. This is a 0's based value (e.g. 3 permits 4).
80  *
81  * - `aer_max_queued`
82  *   This is the maximum number of events that the device will enqueue for
83  *   completion when there are no outstanding AERs. When the maximum number of
84  *   enqueued events are reached, subsequent events will be dropped.
85  *
86  * - `mdts`
87  *   Indicates the maximum data transfer size for a command that transfers data
88  *   between host-accessible memory and the controller. The value is specified
89  *   as a power of two (2^n) and is in units of the minimum memory page size
90  *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
91  *
92  * - `vsl`
93  *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
94  *   this value is specified as a power of two (2^n) and is in units of the
95  *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
96  *   KiB).
97  *
98  * - `zoned.zasl`
99  *   Indicates the maximum data transfer size for the Zone Append command. Like
100  *   `mdts`, the value is specified as a power of two (2^n) and is in units of
101  *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
102  *   defaulting to the value of `mdts`).
103  *
104  * - `zoned.auto_transition`
105  *   Indicates if zones in the Implicitly Opened state may be automatically
106  *   transitioned to the Closed state for resource management purposes.
107  *   Defaults to 'on'.
108  *
109  * nvme namespace device parameters
110  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111  * - `shared`
112  *   When the parent nvme device (as defined explicitly by the 'bus' parameter
113  *   or implicitly by the most recently defined NvmeBus) is linked to an
114  *   nvme-subsys device, the namespace will be attached to all controllers in
115  *   the subsystem. If set to 'off' (the default), the namespace will remain a
116  *   private namespace and may only be attached to a single controller at a
117  *   time.
118  *
119  * - `detached`
120  *   This parameter is only valid together with the `subsys` parameter. If left
121  *   at the default value (`false/off`), the namespace will be attached to all
122  *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
123  *   namespace will be available in the subsystem but not attached to any
124  *   controllers.
125  *
126  * Setting `zoned` to true selects the Zoned Command Set for the namespace.
127  * In this case, the following namespace properties are available to configure
128  * zoned operation:
129  *     zoned.zone_size=<zone size in bytes, default: 128MiB>
130  *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
131  *
132  *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
133  *         The value 0 (default) forces zone capacity to be the same as zone
134  *         size. The value of this property may not exceed zone size.
135  *
136  *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
137  *         This value needs to be specified in 64B units. If it is zero,
138  *         namespace(s) will not support zone descriptor extensions.
139  *
140  *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
141  *         The default value means there is no limit to the number of
142  *         concurrently active zones.
143  *
144  *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
145  *         The default value means there is no limit to the number of
146  *         concurrently open zones.
147  *
148  *     zoned.cross_read=<enable RAZB, default: false>
149  *         Setting this property to true enables Read Across Zone Boundaries.
150  */
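
/*
 * Illustrative example: one possible invocation combining the options
 * documented above to expose a single zoned namespace in a subsystem. The
 * image file and id names (zns.img, nvm-1, subsys0, nvme0) are placeholders.
 *
 *     -drive file=zns.img,if=none,id=nvm-1,format=raw \
 *     -device nvme-subsys,id=subsys0,nqn=subsys0 \
 *     -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0 \
 *     -device nvme-ns,drive=nvm-1,bus=nvme0,nsid=1,zoned=true, \
 *             zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32
 */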
151 
152 #include "qemu/osdep.h"
153 #include "qemu/cutils.h"
154 #include "qemu/error-report.h"
155 #include "qemu/log.h"
156 #include "qemu/units.h"
157 #include "qapi/error.h"
158 #include "qapi/visitor.h"
159 #include "sysemu/sysemu.h"
160 #include "sysemu/block-backend.h"
161 #include "sysemu/hostmem.h"
162 #include "hw/pci/msix.h"
163 #include "migration/vmstate.h"
164 
165 #include "nvme.h"
166 #include "trace.h"
167 
168 #define NVME_MAX_IOQPAIRS 0xffff
169 #define NVME_DB_SIZE  4
170 #define NVME_SPEC_VER 0x00010400
171 #define NVME_CMB_BIR 2
172 #define NVME_PMR_BIR 4
173 #define NVME_TEMPERATURE 0x143
174 #define NVME_TEMPERATURE_WARNING 0x157
175 #define NVME_TEMPERATURE_CRITICAL 0x175
176 #define NVME_NUM_FW_SLOTS 1
177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
178 
179 #define NVME_GUEST_ERR(trace, fmt, ...) \
180     do { \
181         (trace_##trace)(__VA_ARGS__); \
182         qemu_log_mask(LOG_GUEST_ERROR, #trace \
183             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
184     } while (0)
185 
186 static const bool nvme_feature_support[NVME_FID_MAX] = {
187     [NVME_ARBITRATION]              = true,
188     [NVME_POWER_MANAGEMENT]         = true,
189     [NVME_TEMPERATURE_THRESHOLD]    = true,
190     [NVME_ERROR_RECOVERY]           = true,
191     [NVME_VOLATILE_WRITE_CACHE]     = true,
192     [NVME_NUMBER_OF_QUEUES]         = true,
193     [NVME_INTERRUPT_COALESCING]     = true,
194     [NVME_INTERRUPT_VECTOR_CONF]    = true,
195     [NVME_WRITE_ATOMICITY]          = true,
196     [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
197     [NVME_TIMESTAMP]                = true,
198     [NVME_COMMAND_SET_PROFILE]      = true,
199 };
200 
201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
202     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
203     [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
204     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
205     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
206     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
207     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
208     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
209 };
210 
211 static const uint32_t nvme_cse_acs[256] = {
212     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
213     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
214     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
215     [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
216     [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
217     [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
218     [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
219     [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
220     [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
221     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
222     [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
223     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
224 };
225 
226 static const uint32_t nvme_cse_iocs_none[256];
227 
228 static const uint32_t nvme_cse_iocs_nvm[256] = {
229     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
231     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
233     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
235     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
236     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
237 };
238 
239 static const uint32_t nvme_cse_iocs_zoned[256] = {
240     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
241     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
242     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
243     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
244     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
245     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
246     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
247     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
248     [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
249     [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
250     [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
251 };
252 
253 static void nvme_process_sq(void *opaque);
254 
255 static uint16_t nvme_sqid(NvmeRequest *req)
256 {
257     return le16_to_cpu(req->sq->sqid);
258 }
259 
260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261                                    NvmeZoneState state)
262 {
263     if (QTAILQ_IN_USE(zone, entry)) {
264         switch (nvme_get_zone_state(zone)) {
265         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266             QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267             break;
268         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270             break;
271         case NVME_ZONE_STATE_CLOSED:
272             QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273             break;
274         case NVME_ZONE_STATE_FULL:
275             QTAILQ_REMOVE(&ns->full_zones, zone, entry);
276         default:
277             ;
278         }
279     }
280 
281     nvme_set_zone_state(zone, state);
282 
283     switch (state) {
284     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285         QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286         break;
287     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288         QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289         break;
290     case NVME_ZONE_STATE_CLOSED:
291         QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292         break;
293     case NVME_ZONE_STATE_FULL:
294         QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
295     case NVME_ZONE_STATE_READ_ONLY:
296         break;
297     default:
298         zone->d.za = 0;
299     }
300 }
301 
302 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
303                                          uint32_t opn, uint32_t zrwa)
304 {
305     if (ns->params.max_active_zones != 0 &&
306         ns->nr_active_zones + act > ns->params.max_active_zones) {
307         trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
308         return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
309     }
310 
311     if (ns->params.max_open_zones != 0 &&
312         ns->nr_open_zones + opn > ns->params.max_open_zones) {
313         trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
314         return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
315     }
316 
317     if (zrwa > ns->zns.numzrwa) {
318         return NVME_NOZRWA | NVME_DNR;
319     }
320 
321     return NVME_SUCCESS;
322 }
323 
324 /*
325  * Check if we can open a zone without exceeding open/active limits.
326  * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
327  */
328 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
329 {
330     return nvme_zns_check_resources(ns, act, opn, 0);
331 }
332 
333 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
334 {
335     hwaddr hi, lo;
336 
337     if (!n->cmb.cmse) {
338         return false;
339     }
340 
341     lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
342     hi = lo + int128_get64(n->cmb.mem.size);
343 
344     return addr >= lo && addr < hi;
345 }
346 
347 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
348 {
349     hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
350     return &n->cmb.buf[addr - base];
351 }
352 
353 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
354 {
355     hwaddr hi;
356 
357     if (!n->pmr.cmse) {
358         return false;
359     }
360 
361     hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
362 
363     return addr >= n->pmr.cba && addr < hi;
364 }
365 
366 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
367 {
368     return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
369 }
370 
371 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
372 {
373     hwaddr hi, lo;
374 
375     /*
376      * The purpose of this check is to guard against invalid "local" access to
377      * the iomem (i.e. controller registers). Thus, we check against the range
378      * covered by the 'bar0' MemoryRegion since that is currently composed of
379      * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
380      * that if the device model is ever changed to allow the CMB to be located
381      * in BAR0 as well, then this must be changed.
382      */
383     lo = n->bar0.addr;
384     hi = lo + int128_get64(n->bar0.size);
385 
386     return addr >= lo && addr < hi;
387 }
388 
389 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
390 {
391     hwaddr hi = addr + size - 1;
392     if (hi < addr) {
393         return 1;
394     }
395 
396     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
397         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
398         return 0;
399     }
400 
401     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
402         memcpy(buf, nvme_addr_to_pmr(n, addr), size);
403         return 0;
404     }
405 
406     return pci_dma_read(&n->parent_obj, addr, buf, size);
407 }
408 
409 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
410 {
411     hwaddr hi = addr + size - 1;
412     if (hi < addr) {
413         return 1;
414     }
415 
416     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
417         memcpy(nvme_addr_to_cmb(n, addr), buf, size);
418         return 0;
419     }
420 
421     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
422         memcpy(nvme_addr_to_pmr(n, addr), buf, size);
423         return 0;
424     }
425 
426     return pci_dma_write(&n->parent_obj, addr, buf, size);
427 }
428 
429 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
430 {
431     return nsid &&
432         (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
433 }
434 
435 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
436 {
437     return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
438 }
439 
440 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
441 {
442     return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
443 }
444 
445 static void nvme_inc_cq_tail(NvmeCQueue *cq)
446 {
447     cq->tail++;
448     if (cq->tail >= cq->size) {
449         cq->tail = 0;
450         cq->phase = !cq->phase;
451     }
452 }
453 
454 static void nvme_inc_sq_head(NvmeSQueue *sq)
455 {
456     sq->head = (sq->head + 1) % sq->size;
457 }
458 
459 static uint8_t nvme_cq_full(NvmeCQueue *cq)
460 {
461     return (cq->tail + 1) % cq->size == cq->head;
462 }
463 
464 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
465 {
466     return sq->head == sq->tail;
467 }
468 
469 static void nvme_irq_check(NvmeCtrl *n)
470 {
471     uint32_t intms = ldl_le_p(&n->bar.intms);
472 
473     if (msix_enabled(&(n->parent_obj))) {
474         return;
475     }
476     if (~intms & n->irq_status) {
477         pci_irq_assert(&n->parent_obj);
478     } else {
479         pci_irq_deassert(&n->parent_obj);
480     }
481 }
482 
483 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
484 {
485     if (cq->irq_enabled) {
486         if (msix_enabled(&(n->parent_obj))) {
487             trace_pci_nvme_irq_msix(cq->vector);
488             msix_notify(&(n->parent_obj), cq->vector);
489         } else {
490             trace_pci_nvme_irq_pin();
491             assert(cq->vector < 32);
492             n->irq_status |= 1 << cq->vector;
493             nvme_irq_check(n);
494         }
495     } else {
496         trace_pci_nvme_irq_masked();
497     }
498 }
499 
500 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
501 {
502     if (cq->irq_enabled) {
503         if (msix_enabled(&(n->parent_obj))) {
504             return;
505         } else {
506             assert(cq->vector < 32);
507             if (!n->cq_pending) {
508                 n->irq_status &= ~(1 << cq->vector);
509             }
510             nvme_irq_check(n);
511         }
512     }
513 }
514 
515 static void nvme_req_clear(NvmeRequest *req)
516 {
517     req->ns = NULL;
518     req->opaque = NULL;
519     req->aiocb = NULL;
520     memset(&req->cqe, 0x0, sizeof(req->cqe));
521     req->status = NVME_SUCCESS;
522 }
523 
524 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
525 {
526     if (dma) {
527         pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
528         sg->flags = NVME_SG_DMA;
529     } else {
530         qemu_iovec_init(&sg->iov, 0);
531     }
532 
533     sg->flags |= NVME_SG_ALLOC;
534 }
535 
536 static inline void nvme_sg_unmap(NvmeSg *sg)
537 {
538     if (!(sg->flags & NVME_SG_ALLOC)) {
539         return;
540     }
541 
542     if (sg->flags & NVME_SG_DMA) {
543         qemu_sglist_destroy(&sg->qsg);
544     } else {
545         qemu_iovec_destroy(&sg->iov);
546     }
547 
548     memset(sg, 0x0, sizeof(*sg));
549 }
550 
551 /*
552  * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
553  * holds both data and metadata. This function splits the data and metadata
554  * into two separate QSG/IOVs.
555  */
556 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
557                           NvmeSg *mdata)
558 {
559     NvmeSg *dst = data;
560     uint32_t trans_len, count = ns->lbasz;
561     uint64_t offset = 0;
562     bool dma = sg->flags & NVME_SG_DMA;
563     size_t sge_len;
564     size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
565     int sg_idx = 0;
566 
567     assert(sg->flags & NVME_SG_ALLOC);
568 
569     while (sg_len) {
570         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
571 
572         trans_len = MIN(sg_len, count);
573         trans_len = MIN(trans_len, sge_len - offset);
574 
575         if (dst) {
576             if (dma) {
577                 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
578                                 trans_len);
579             } else {
580                 qemu_iovec_add(&dst->iov,
581                                sg->iov.iov[sg_idx].iov_base + offset,
582                                trans_len);
583             }
584         }
585 
586         sg_len -= trans_len;
587         count -= trans_len;
588         offset += trans_len;
589 
590         if (count == 0) {
591             dst = (dst == data) ? mdata : data;
592             count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
593         }
594 
595         if (sge_len == offset) {
596             offset = 0;
597             sg_idx++;
598         }
599     }
600 }
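
/*
 * Illustration of nvme_sg_split() above: with ns->lbasz = 4096 and
 * ns->lbaf.ms = 64, an extended-LBA transfer of two blocks maps
 * 2 * (4096 + 64) = 8320 bytes. The function walks the combined SG list and
 * alternates between the data and metadata destinations every lbasz and
 * lbaf.ms bytes respectively, producing a data SG of 2 * 4096 bytes and a
 * metadata SG of 2 * 64 bytes, each in LBA order.
 */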
601 
602 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
603                                   size_t len)
604 {
605     if (!len) {
606         return NVME_SUCCESS;
607     }
608 
609     trace_pci_nvme_map_addr_cmb(addr, len);
610 
611     if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
612         return NVME_DATA_TRAS_ERROR;
613     }
614 
615     qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
616 
617     return NVME_SUCCESS;
618 }
619 
620 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
621                                   size_t len)
622 {
623     if (!len) {
624         return NVME_SUCCESS;
625     }
626 
627     if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
628         return NVME_DATA_TRAS_ERROR;
629     }
630 
631     qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
632 
633     return NVME_SUCCESS;
634 }
635 
636 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
637 {
638     bool cmb = false, pmr = false;
639 
640     if (!len) {
641         return NVME_SUCCESS;
642     }
643 
644     trace_pci_nvme_map_addr(addr, len);
645 
646     if (nvme_addr_is_iomem(n, addr)) {
647         return NVME_DATA_TRAS_ERROR;
648     }
649 
650     if (nvme_addr_is_cmb(n, addr)) {
651         cmb = true;
652     } else if (nvme_addr_is_pmr(n, addr)) {
653         pmr = true;
654     }
655 
656     if (cmb || pmr) {
657         if (sg->flags & NVME_SG_DMA) {
658             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
659         }
660 
661         if (sg->iov.niov + 1 > IOV_MAX) {
662             goto max_mappings_exceeded;
663         }
664 
665         if (cmb) {
666             return nvme_map_addr_cmb(n, &sg->iov, addr, len);
667         } else {
668             return nvme_map_addr_pmr(n, &sg->iov, addr, len);
669         }
670     }
671 
672     if (!(sg->flags & NVME_SG_DMA)) {
673         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
674     }
675 
676     if (sg->qsg.nsg + 1 > IOV_MAX) {
677         goto max_mappings_exceeded;
678     }
679 
680     qemu_sglist_add(&sg->qsg, addr, len);
681 
682     return NVME_SUCCESS;
683 
684 max_mappings_exceeded:
685     NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
686                    "number of mappings exceeds 1024");
687     return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
688 }
689 
690 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
691 {
692     return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
693 }
694 
695 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
696                              uint64_t prp2, uint32_t len)
697 {
698     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
699     trans_len = MIN(len, trans_len);
700     int num_prps = (len >> n->page_bits) + 1;
701     uint16_t status;
702     int ret;
703 
704     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
705 
706     nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
707 
708     status = nvme_map_addr(n, sg, prp1, trans_len);
709     if (status) {
710         goto unmap;
711     }
712 
713     len -= trans_len;
714     if (len) {
715         if (len > n->page_size) {
716             uint64_t prp_list[n->max_prp_ents];
717             uint32_t nents, prp_trans;
718             int i = 0;
719 
720             /*
721              * The first PRP list entry, pointed to by PRP2, may contain an
722              * offset. Hence, we need to calculate the number of entries in
723              * the list based on that offset.
724              */
725             nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
726             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
727             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
728             if (ret) {
729                 trace_pci_nvme_err_addr_read(prp2);
730                 status = NVME_DATA_TRAS_ERROR;
731                 goto unmap;
732             }
733             while (len != 0) {
734                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
735 
736                 if (i == nents - 1 && len > n->page_size) {
737                     if (unlikely(prp_ent & (n->page_size - 1))) {
738                         trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
739                         status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
740                         goto unmap;
741                     }
742 
743                     i = 0;
744                     nents = (len + n->page_size - 1) >> n->page_bits;
745                     nents = MIN(nents, n->max_prp_ents);
746                     prp_trans = nents * sizeof(uint64_t);
747                     ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
748                                          prp_trans);
749                     if (ret) {
750                         trace_pci_nvme_err_addr_read(prp_ent);
751                         status = NVME_DATA_TRAS_ERROR;
752                         goto unmap;
753                     }
754                     prp_ent = le64_to_cpu(prp_list[i]);
755                 }
756 
757                 if (unlikely(prp_ent & (n->page_size - 1))) {
758                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
759                     status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
760                     goto unmap;
761                 }
762 
763                 trans_len = MIN(len, n->page_size);
764                 status = nvme_map_addr(n, sg, prp_ent, trans_len);
765                 if (status) {
766                     goto unmap;
767                 }
768 
769                 len -= trans_len;
770                 i++;
771             }
772         } else {
773             if (unlikely(prp2 & (n->page_size - 1))) {
774                 trace_pci_nvme_err_invalid_prp2_align(prp2);
775                 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
776                 goto unmap;
777             }
778             status = nvme_map_addr(n, sg, prp2, len);
779             if (status) {
780                 goto unmap;
781             }
782         }
783     }
784 
785     return NVME_SUCCESS;
786 
787 unmap:
788     nvme_sg_unmap(sg);
789     return status;
790 }
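
/*
 * Worked example for nvme_map_prp() above: with a 4 KiB page size, a transfer
 * of len = 16384 bytes and PRP1 pointing 1024 bytes into its page, the first
 * mapping covers 3072 bytes. The remaining 13312 bytes exceed one page, so
 * PRP2 is interpreted as a PRP list and four page-aligned list entries are
 * mapped (3 * 4096 + 1024 bytes).
 */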
791 
792 /*
793  * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
794  * number of bytes mapped from *len.
795  */
796 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
797                                   NvmeSglDescriptor *segment, uint64_t nsgld,
798                                   size_t *len, NvmeCmd *cmd)
799 {
800     dma_addr_t addr, trans_len;
801     uint32_t dlen;
802     uint16_t status;
803 
804     for (int i = 0; i < nsgld; i++) {
805         uint8_t type = NVME_SGL_TYPE(segment[i].type);
806 
807         switch (type) {
808         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
809             if (cmd->opcode == NVME_CMD_WRITE) {
810                 continue;
811             }
812         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
813             break;
814         case NVME_SGL_DESCR_TYPE_SEGMENT:
815         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
816             return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
817         default:
818             return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
819         }
820 
821         dlen = le32_to_cpu(segment[i].len);
822 
823         if (!dlen) {
824             continue;
825         }
826 
827         if (*len == 0) {
828             /*
829              * All data has been mapped, but the SGL contains additional
830              * segments and/or descriptors. Depending on its SGL Support
831              * (SGLS) field, the controller may ignore the rest of the SGL.
832              */
833             uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
834             if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
835                 break;
836             }
837 
838             trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
839             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
840         }
841 
842         trans_len = MIN(*len, dlen);
843 
844         if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
845             goto next;
846         }
847 
848         addr = le64_to_cpu(segment[i].addr);
849 
850         if (UINT64_MAX - addr < dlen) {
851             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
852         }
853 
854         status = nvme_map_addr(n, sg, addr, trans_len);
855         if (status) {
856             return status;
857         }
858 
859 next:
860         *len -= trans_len;
861     }
862 
863     return NVME_SUCCESS;
864 }
865 
866 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
867                              size_t len, NvmeCmd *cmd)
868 {
869     /*
870      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
871      * dynamically allocating a potentially huge SGL. The spec allows the SGL
872      * to be larger (as in number of bytes required to describe the SGL
873      * descriptors and segment chain) than the command transfer size, so it is
874      * not bounded by MDTS.
875      */
876     const int SEG_CHUNK_SIZE = 256;
877 
878     NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
879     uint64_t nsgld;
880     uint32_t seg_len;
881     uint16_t status;
882     hwaddr addr;
883     int ret;
884 
885     sgld = &sgl;
886     addr = le64_to_cpu(sgl.addr);
887 
888     trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
889 
890     nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
891 
892     /*
893      * If the entire transfer can be described with a single data block it can
894      * be mapped directly.
895      */
896     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
897         status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
898         if (status) {
899             goto unmap;
900         }
901 
902         goto out;
903     }
904 
905     for (;;) {
906         switch (NVME_SGL_TYPE(sgld->type)) {
907         case NVME_SGL_DESCR_TYPE_SEGMENT:
908         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
909             break;
910         default:
911             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
912         }
913 
914         seg_len = le32_to_cpu(sgld->len);
915 
916         /* check the length of the (Last) Segment descriptor */
917         if ((!seg_len || seg_len & 0xf) &&
918             (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
919             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
920         }
921 
922         if (UINT64_MAX - addr < seg_len) {
923             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
924         }
925 
926         nsgld = seg_len / sizeof(NvmeSglDescriptor);
927 
928         while (nsgld > SEG_CHUNK_SIZE) {
929             if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
930                 trace_pci_nvme_err_addr_read(addr);
931                 status = NVME_DATA_TRAS_ERROR;
932                 goto unmap;
933             }
934 
935             status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
936                                        &len, cmd);
937             if (status) {
938                 goto unmap;
939             }
940 
941             nsgld -= SEG_CHUNK_SIZE;
942             addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
943         }
944 
945         ret = nvme_addr_read(n, addr, segment, nsgld *
946                              sizeof(NvmeSglDescriptor));
947         if (ret) {
948             trace_pci_nvme_err_addr_read(addr);
949             status = NVME_DATA_TRAS_ERROR;
950             goto unmap;
951         }
952 
953         last_sgld = &segment[nsgld - 1];
954 
955         /*
956          * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
957          * then we are done.
958          */
959         switch (NVME_SGL_TYPE(last_sgld->type)) {
960         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
961         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
962             status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
963             if (status) {
964                 goto unmap;
965             }
966 
967             goto out;
968 
969         default:
970             break;
971         }
972 
973         /*
974          * If the last descriptor was not a Data Block or Bit Bucket, then the
975          * current segment must not be a Last Segment.
976          */
977         if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
978             status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
979             goto unmap;
980         }
981 
982         sgld = last_sgld;
983         addr = le64_to_cpu(sgld->addr);
984 
985         /*
986          * Do not map the last descriptor; it will be a Segment or Last Segment
987          * descriptor and is handled by the next iteration.
988          */
989         status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
990         if (status) {
991             goto unmap;
992         }
993     }
994 
995 out:
996     /* if there is any residual left in len, the SGL was too short */
997     if (len) {
998         status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
999         goto unmap;
1000     }
1001 
1002     return NVME_SUCCESS;
1003 
1004 unmap:
1005     nvme_sg_unmap(sg);
1006     return status;
1007 }
1008 
1009 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1010                        NvmeCmd *cmd)
1011 {
1012     uint64_t prp1, prp2;
1013 
1014     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1015     case NVME_PSDT_PRP:
1016         prp1 = le64_to_cpu(cmd->dptr.prp1);
1017         prp2 = le64_to_cpu(cmd->dptr.prp2);
1018 
1019         return nvme_map_prp(n, sg, prp1, prp2, len);
1020     case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1021     case NVME_PSDT_SGL_MPTR_SGL:
1022         return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1023     default:
1024         return NVME_INVALID_FIELD;
1025     }
1026 }
1027 
1028 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1029                               NvmeCmd *cmd)
1030 {
1031     int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1032     hwaddr mptr = le64_to_cpu(cmd->mptr);
1033     uint16_t status;
1034 
1035     if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1036         NvmeSglDescriptor sgl;
1037 
1038         if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1039             return NVME_DATA_TRAS_ERROR;
1040         }
1041 
1042         status = nvme_map_sgl(n, sg, sgl, len, cmd);
1043         if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1044             status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1045         }
1046 
1047         return status;
1048     }
1049 
1050     nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1051     status = nvme_map_addr(n, sg, mptr, len);
1052     if (status) {
1053         nvme_sg_unmap(sg);
1054     }
1055 
1056     return status;
1057 }
1058 
1059 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1060 {
1061     NvmeNamespace *ns = req->ns;
1062     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1063     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1064     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1065     size_t len = nvme_l2b(ns, nlb);
1066     uint16_t status;
1067 
1068     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1069         NvmeSg sg;
1070 
1071         len += nvme_m2b(ns, nlb);
1072 
1073         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1074         if (status) {
1075             return status;
1076         }
1077 
1078         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1079         nvme_sg_split(&sg, ns, &req->sg, NULL);
1080         nvme_sg_unmap(&sg);
1081 
1082         return NVME_SUCCESS;
1083     }
1084 
1085     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1086 }
1087 
1088 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1089 {
1090     NvmeNamespace *ns = req->ns;
1091     size_t len = nvme_m2b(ns, nlb);
1092     uint16_t status;
1093 
1094     if (nvme_ns_ext(ns)) {
1095         NvmeSg sg;
1096 
1097         len += nvme_l2b(ns, nlb);
1098 
1099         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1100         if (status) {
1101             return status;
1102         }
1103 
1104         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1105         nvme_sg_split(&sg, ns, NULL, &req->sg);
1106         nvme_sg_unmap(&sg);
1107 
1108         return NVME_SUCCESS;
1109     }
1110 
1111     return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1112 }
1113 
1114 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1115                                     uint32_t len, uint32_t bytes,
1116                                     int32_t skip_bytes, int64_t offset,
1117                                     NvmeTxDirection dir)
1118 {
1119     hwaddr addr;
1120     uint32_t trans_len, count = bytes;
1121     bool dma = sg->flags & NVME_SG_DMA;
1122     int64_t sge_len;
1123     int sg_idx = 0;
1124     int ret;
1125 
1126     assert(sg->flags & NVME_SG_ALLOC);
1127 
1128     while (len) {
1129         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1130 
1131         if (sge_len - offset < 0) {
1132             offset -= sge_len;
1133             sg_idx++;
1134             continue;
1135         }
1136 
1137         if (sge_len == offset) {
1138             offset = 0;
1139             sg_idx++;
1140             continue;
1141         }
1142 
1143         trans_len = MIN(len, count);
1144         trans_len = MIN(trans_len, sge_len - offset);
1145 
1146         if (dma) {
1147             addr = sg->qsg.sg[sg_idx].base + offset;
1148         } else {
1149             addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1150         }
1151 
1152         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1153             ret = nvme_addr_read(n, addr, ptr, trans_len);
1154         } else {
1155             ret = nvme_addr_write(n, addr, ptr, trans_len);
1156         }
1157 
1158         if (ret) {
1159             return NVME_DATA_TRAS_ERROR;
1160         }
1161 
1162         ptr += trans_len;
1163         len -= trans_len;
1164         count -= trans_len;
1165         offset += trans_len;
1166 
1167         if (count == 0) {
1168             count = bytes;
1169             offset += skip_bytes;
1170         }
1171     }
1172 
1173     return NVME_SUCCESS;
1174 }
1175 
1176 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1177                         NvmeTxDirection dir)
1178 {
1179     assert(sg->flags & NVME_SG_ALLOC);
1180 
1181     if (sg->flags & NVME_SG_DMA) {
1182         const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1183         dma_addr_t residual;
1184 
1185         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1186             dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1187         } else {
1188             dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1189         }
1190 
1191         if (unlikely(residual)) {
1192             trace_pci_nvme_err_invalid_dma();
1193             return NVME_INVALID_FIELD | NVME_DNR;
1194         }
1195     } else {
1196         size_t bytes;
1197 
1198         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1199             bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1200         } else {
1201             bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1202         }
1203 
1204         if (unlikely(bytes != len)) {
1205             trace_pci_nvme_err_invalid_dma();
1206             return NVME_INVALID_FIELD | NVME_DNR;
1207         }
1208     }
1209 
1210     return NVME_SUCCESS;
1211 }
1212 
1213 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1214                                 NvmeRequest *req)
1215 {
1216     uint16_t status;
1217 
1218     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1219     if (status) {
1220         return status;
1221     }
1222 
1223     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1224 }
1225 
1226 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1227                                 NvmeRequest *req)
1228 {
1229     uint16_t status;
1230 
1231     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1232     if (status) {
1233         return status;
1234     }
1235 
1236     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1237 }
1238 
1239 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1240                           NvmeTxDirection dir, NvmeRequest *req)
1241 {
1242     NvmeNamespace *ns = req->ns;
1243     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1244     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1245     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1246 
1247     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1248         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1249                                    ns->lbaf.ms, 0, dir);
1250     }
1251 
1252     return nvme_tx(n, &req->sg, ptr, len, dir);
1253 }
1254 
1255 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1256                            NvmeTxDirection dir, NvmeRequest *req)
1257 {
1258     NvmeNamespace *ns = req->ns;
1259     uint16_t status;
1260 
1261     if (nvme_ns_ext(ns)) {
1262         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1263                                    ns->lbasz, ns->lbasz, dir);
1264     }
1265 
1266     nvme_sg_unmap(&req->sg);
1267 
1268     status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1269     if (status) {
1270         return status;
1271     }
1272 
1273     return nvme_tx(n, &req->sg, ptr, len, dir);
1274 }
1275 
1276 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1277                                  BlockCompletionFunc *cb, NvmeRequest *req)
1278 {
1279     assert(req->sg.flags & NVME_SG_ALLOC);
1280 
1281     if (req->sg.flags & NVME_SG_DMA) {
1282         req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1283                                   cb, req);
1284     } else {
1285         req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1286     }
1287 }
1288 
1289 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1290                                   BlockCompletionFunc *cb, NvmeRequest *req)
1291 {
1292     assert(req->sg.flags & NVME_SG_ALLOC);
1293 
1294     if (req->sg.flags & NVME_SG_DMA) {
1295         req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1296                                    cb, req);
1297     } else {
1298         req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1299     }
1300 }
1301 
1302 static void nvme_post_cqes(void *opaque)
1303 {
1304     NvmeCQueue *cq = opaque;
1305     NvmeCtrl *n = cq->ctrl;
1306     NvmeRequest *req, *next;
1307     bool pending = cq->head != cq->tail;
1308     int ret;
1309 
1310     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1311         NvmeSQueue *sq;
1312         hwaddr addr;
1313 
1314         if (nvme_cq_full(cq)) {
1315             break;
1316         }
1317 
1318         sq = req->sq;
1319         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1320         req->cqe.sq_id = cpu_to_le16(sq->sqid);
1321         req->cqe.sq_head = cpu_to_le16(sq->head);
1322         addr = cq->dma_addr + cq->tail * n->cqe_size;
1323         ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1324                             sizeof(req->cqe));
1325         if (ret) {
1326             trace_pci_nvme_err_addr_write(addr);
1327             trace_pci_nvme_err_cfs();
1328             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1329             break;
1330         }
1331         QTAILQ_REMOVE(&cq->req_list, req, entry);
1332         nvme_inc_cq_tail(cq);
1333         nvme_sg_unmap(&req->sg);
1334         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1335     }
1336     if (cq->tail != cq->head) {
1337         if (cq->irq_enabled && !pending) {
1338             n->cq_pending++;
1339         }
1340 
1341         nvme_irq_assert(n, cq);
1342     }
1343 }
1344 
1345 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1346 {
1347     assert(cq->cqid == req->sq->cqid);
1348     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1349                                           le32_to_cpu(req->cqe.result),
1350                                           le32_to_cpu(req->cqe.dw1),
1351                                           req->status);
1352 
1353     if (req->status) {
1354         trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1355                                       req->status, req->cmd.opcode);
1356     }
1357 
1358     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1359     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1360     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1361 }
1362 
1363 static void nvme_process_aers(void *opaque)
1364 {
1365     NvmeCtrl *n = opaque;
1366     NvmeAsyncEvent *event, *next;
1367 
1368     trace_pci_nvme_process_aers(n->aer_queued);
1369 
1370     QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1371         NvmeRequest *req;
1372         NvmeAerResult *result;
1373 
1374         /* can't post cqe if there is nothing to complete */
1375         if (!n->outstanding_aers) {
1376             trace_pci_nvme_no_outstanding_aers();
1377             break;
1378         }
1379 
1380         /* ignore if masked (cqe posted, but event not cleared) */
1381         if (n->aer_mask & (1 << event->result.event_type)) {
1382             trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1383             continue;
1384         }
1385 
1386         QTAILQ_REMOVE(&n->aer_queue, event, entry);
1387         n->aer_queued--;
1388 
1389         n->aer_mask |= 1 << event->result.event_type;
1390         n->outstanding_aers--;
1391 
1392         req = n->aer_reqs[n->outstanding_aers];
1393 
1394         result = (NvmeAerResult *) &req->cqe.result;
1395         result->event_type = event->result.event_type;
1396         result->event_info = event->result.event_info;
1397         result->log_page = event->result.log_page;
1398         g_free(event);
1399 
1400         trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1401                                     result->log_page);
1402 
1403         nvme_enqueue_req_completion(&n->admin_cq, req);
1404     }
1405 }
1406 
1407 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1408                                uint8_t event_info, uint8_t log_page)
1409 {
1410     NvmeAsyncEvent *event;
1411 
1412     trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1413 
1414     if (n->aer_queued == n->params.aer_max_queued) {
1415         trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1416         return;
1417     }
1418 
1419     event = g_new(NvmeAsyncEvent, 1);
1420     event->result = (NvmeAerResult) {
1421         .event_type = event_type,
1422         .event_info = event_info,
1423         .log_page   = log_page,
1424     };
1425 
1426     QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1427     n->aer_queued++;
1428 
1429     nvme_process_aers(n);
1430 }
1431 
1432 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1433 {
1434     uint8_t aer_info;
1435 
1436     /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1437     if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1438         return;
1439     }
1440 
1441     switch (event) {
1442     case NVME_SMART_SPARE:
1443         aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1444         break;
1445     case NVME_SMART_TEMPERATURE:
1446         aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1447         break;
1448     case NVME_SMART_RELIABILITY:
1449     case NVME_SMART_MEDIA_READ_ONLY:
1450     case NVME_SMART_FAILED_VOLATILE_MEDIA:
1451     case NVME_SMART_PMR_UNRELIABLE:
1452         aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1453         break;
1454     default:
1455         return;
1456     }
1457 
1458     nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1459 }
1460 
1461 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1462 {
1463     n->aer_mask &= ~(1 << event_type);
1464     if (!QTAILQ_EMPTY(&n->aer_queue)) {
1465         nvme_process_aers(n);
1466     }
1467 }
1468 
1469 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1470 {
1471     uint8_t mdts = n->params.mdts;
1472 
1473     if (mdts && len > n->page_size << mdts) {
1474         trace_pci_nvme_err_mdts(len);
1475         return NVME_INVALID_FIELD | NVME_DNR;
1476     }
1477 
1478     return NVME_SUCCESS;
1479 }
1480 
1481 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1482                                          uint32_t nlb)
1483 {
1484     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1485 
1486     if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1487         trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1488         return NVME_LBA_RANGE | NVME_DNR;
1489     }
1490 
1491     return NVME_SUCCESS;
1492 }
1493 
1494 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1495                                  uint32_t nlb, int flags)
1496 {
1497     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1498 
1499     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1500     int64_t offset = nvme_l2b(ns, slba);
1501     int ret;
1502 
1503     /*
1504      * `pnum` holds the number of bytes after offset that share the same
1505      * allocation status as the byte at offset. If `pnum` is different from
1506      * `bytes`, we should check the allocation status of the next range and
1507      * continue this until all bytes have been checked.
1508      */
1509     do {
1510         bytes -= pnum;
1511 
1512         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1513         if (ret < 0) {
1514             return ret;
1515         }
1516 
1517 
1518         trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1519                                     !!(ret & BDRV_BLOCK_ZERO));
1520 
1521         if (!(ret & flags)) {
1522             return 1;
1523         }
1524 
1525         offset += pnum;
1526     } while (pnum != bytes);
1527 
1528     return 0;
1529 }
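
/*
 * Example of the loop above: when checking eight blocks of which only the
 * first four are allocated, the first bdrv_block_status() call reports
 * BDRV_BLOCK_DATA for the allocated half (pnum < bytes), the next call
 * reports the remainder without BDRV_BLOCK_DATA, and the function returns 1
 * (which nvme_check_dulbe() below turns into NVME_DULB).
 */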
1530 
1531 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1532                                  uint32_t nlb)
1533 {
1534     int ret;
1535     Error *err = NULL;
1536 
1537     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1538     if (ret) {
1539         if (ret < 0) {
1540             error_setg_errno(&err, -ret, "unable to get block status");
1541             error_report_err(err);
1542 
1543             return NVME_INTERNAL_DEV_ERROR;
1544         }
1545 
1546         return NVME_DULB;
1547     }
1548 
1549     return NVME_SUCCESS;
1550 }
1551 
1552 static void nvme_aio_err(NvmeRequest *req, int ret)
1553 {
1554     uint16_t status = NVME_SUCCESS;
1555     Error *local_err = NULL;
1556 
1557     switch (req->cmd.opcode) {
1558     case NVME_CMD_READ:
1559         status = NVME_UNRECOVERED_READ;
1560         break;
1561     case NVME_CMD_FLUSH:
1562     case NVME_CMD_WRITE:
1563     case NVME_CMD_WRITE_ZEROES:
1564     case NVME_CMD_ZONE_APPEND:
1565         status = NVME_WRITE_FAULT;
1566         break;
1567     default:
1568         status = NVME_INTERNAL_DEV_ERROR;
1569         break;
1570     }
1571 
1572     trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1573 
1574     error_setg_errno(&local_err, -ret, "aio failed");
1575     error_report_err(local_err);
1576 
1577     /*
1578      * Set the command status code to the first encountered error but allow a
1579      * subsequent Internal Device Error to trump it.
1580      */
1581     if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1582         return;
1583     }
1584 
1585     req->status = status;
1586 }
1587 
1588 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1589 {
1590     return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1591                                     slba / ns->zone_size;
1592 }
1593 
1594 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1595 {
1596     uint32_t zone_idx = nvme_zone_idx(ns, slba);
1597 
1598     if (zone_idx >= ns->num_zones) {
1599         return NULL;
1600     }
1601 
1602     return &ns->zone_array[zone_idx];
1603 }
1604 
1605 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1606 {
1607     uint64_t zslba = zone->d.zslba;
1608 
1609     switch (nvme_get_zone_state(zone)) {
1610     case NVME_ZONE_STATE_EMPTY:
1611     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1612     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1613     case NVME_ZONE_STATE_CLOSED:
1614         return NVME_SUCCESS;
1615     case NVME_ZONE_STATE_FULL:
1616         trace_pci_nvme_err_zone_is_full(zslba);
1617         return NVME_ZONE_FULL;
1618     case NVME_ZONE_STATE_OFFLINE:
1619         trace_pci_nvme_err_zone_is_offline(zslba);
1620         return NVME_ZONE_OFFLINE;
1621     case NVME_ZONE_STATE_READ_ONLY:
1622         trace_pci_nvme_err_zone_is_read_only(zslba);
1623         return NVME_ZONE_READ_ONLY;
1624     default:
1625         assert(false);
1626     }
1627 
1628     return NVME_INTERNAL_DEV_ERROR;
1629 }
1630 
1631 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1632                                       uint64_t slba, uint32_t nlb)
1633 {
1634     uint64_t zcap = nvme_zone_wr_boundary(zone);
1635     uint16_t status;
1636 
1637     status = nvme_check_zone_state_for_write(zone);
1638     if (status) {
1639         return status;
1640     }
1641 
1642     if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1643         uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1644 
1645         if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1646             trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1647             return NVME_ZONE_INVALID_WRITE;
1648         }
1649     } else {
1650         if (unlikely(slba != zone->w_ptr)) {
1651             trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1652                                                zone->w_ptr);
1653             return NVME_ZONE_INVALID_WRITE;
1654         }
1655     }
1656 
1657     if (unlikely((slba + nlb) > zcap)) {
1658         trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1659         return NVME_ZONE_BOUNDARY_ERROR;
1660     }
1661 
1662     return NVME_SUCCESS;
1663 }
1664 
1665 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1666 {
1667     switch (nvme_get_zone_state(zone)) {
1668     case NVME_ZONE_STATE_EMPTY:
1669     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1670     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1671     case NVME_ZONE_STATE_FULL:
1672     case NVME_ZONE_STATE_CLOSED:
1673     case NVME_ZONE_STATE_READ_ONLY:
1674         return NVME_SUCCESS;
1675     case NVME_ZONE_STATE_OFFLINE:
1676         trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1677         return NVME_ZONE_OFFLINE;
1678     default:
1679         assert(false);
1680     }
1681 
1682     return NVME_INTERNAL_DEV_ERROR;
1683 }
1684 
1685 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1686                                      uint32_t nlb)
1687 {
1688     NvmeZone *zone;
1689     uint64_t bndry, end;
1690     uint16_t status;
1691 
1692     zone = nvme_get_zone_by_slba(ns, slba);
1693     assert(zone);
1694 
1695     bndry = nvme_zone_rd_boundary(ns, zone);
1696     end = slba + nlb;
1697 
1698     status = nvme_check_zone_state_for_read(zone);
1699     if (status) {
1700         ;
1701     } else if (unlikely(end > bndry)) {
1702         if (!ns->params.cross_zone_read) {
1703             status = NVME_ZONE_BOUNDARY_ERROR;
1704         } else {
1705             /*
1706              * Read across zone boundary - check that all subsequent
1707              * zones that are being read have an appropriate state.
1708              */
1709             do {
1710                 zone++;
1711                 status = nvme_check_zone_state_for_read(zone);
1712                 if (status) {
1713                     break;
1714                 }
1715             } while (end > nvme_zone_rd_boundary(ns, zone));
1716         }
1717     }
1718 
1719     return status;
1720 }
1721 
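     /*
      * Descriptive note (added): the nvme_zrm_* helpers below implement the
      * zone state machine transitions (finish, close, reset, open), keeping
      * the per-namespace open/active resource counters and any associated
      * ZRWA resources in sync with the zone state.
      */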
1722 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1723 {
1724     switch (nvme_get_zone_state(zone)) {
1725     case NVME_ZONE_STATE_FULL:
1726         return NVME_SUCCESS;
1727 
1728     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1729     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1730         nvme_aor_dec_open(ns);
1731         /* fallthrough */
1732     case NVME_ZONE_STATE_CLOSED:
1733         nvme_aor_dec_active(ns);
1734 
1735         if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1736             zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1737             if (ns->params.numzrwa) {
1738                 ns->zns.numzrwa++;
1739             }
1740         }
1741 
1742         /* fallthrough */
1743     case NVME_ZONE_STATE_EMPTY:
1744         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1745         return NVME_SUCCESS;
1746 
1747     default:
1748         return NVME_ZONE_INVAL_TRANSITION;
1749     }
1750 }
1751 
1752 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1753 {
1754     switch (nvme_get_zone_state(zone)) {
1755     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1756     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1757         nvme_aor_dec_open(ns);
1758         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1759         /* fall through */
1760     case NVME_ZONE_STATE_CLOSED:
1761         return NVME_SUCCESS;
1762 
1763     default:
1764         return NVME_ZONE_INVAL_TRANSITION;
1765     }
1766 }
1767 
1768 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1769 {
1770     switch (nvme_get_zone_state(zone)) {
1771     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1772     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1773         nvme_aor_dec_open(ns);
1774         /* fallthrough */
1775     case NVME_ZONE_STATE_CLOSED:
1776         nvme_aor_dec_active(ns);
1777 
1778         if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1779             if (ns->params.numzrwa) {
1780                 ns->zns.numzrwa++;
1781             }
1782         }
1783 
1784         /* fallthrough */
1785     case NVME_ZONE_STATE_FULL:
1786         zone->w_ptr = zone->d.zslba;
1787         zone->d.wp = zone->w_ptr;
1788         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1789         /* fallthrough */
1790     case NVME_ZONE_STATE_EMPTY:
1791         return NVME_SUCCESS;
1792 
1793     default:
1794         return NVME_ZONE_INVAL_TRANSITION;
1795     }
1796 }
1797 
1798 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1799 {
1800     NvmeZone *zone;
1801 
1802     if (ns->params.max_open_zones &&
1803         ns->nr_open_zones == ns->params.max_open_zones) {
1804         zone = QTAILQ_FIRST(&ns->imp_open_zones);
1805         if (zone) {
1806             /*
1807              * Automatically close this implicitly open zone.
1808              */
1809             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1810             nvme_zrm_close(ns, zone);
1811         }
1812     }
1813 }
1814 
1815 enum {
1816     NVME_ZRM_AUTO = 1 << 0,
1817     NVME_ZRM_ZRWA = 1 << 1,
1818 };
1819 
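     /*
      * Descriptive note (added): transition a zone to an open state. With
      * NVME_ZRM_AUTO the zone is implicitly opened (possibly auto-closing
      * another implicitly open zone when the open resource limit is reached);
      * otherwise it is explicitly opened. NVME_ZRM_ZRWA additionally
      * allocates a ZRWA resource for the zone.
      */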
1820 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1821                                     NvmeZone *zone, int flags)
1822 {
1823     int act = 0;
1824     uint16_t status;
1825 
1826     switch (nvme_get_zone_state(zone)) {
1827     case NVME_ZONE_STATE_EMPTY:
1828         act = 1;
1829 
1830         /* fallthrough */
1831 
1832     case NVME_ZONE_STATE_CLOSED:
1833         if (n->params.auto_transition_zones) {
1834             nvme_zrm_auto_transition_zone(ns);
1835         }
1836         status = nvme_zns_check_resources(ns, act, 1,
1837                                           (flags & NVME_ZRM_ZRWA) ? 1 : 0);
1838         if (status) {
1839             return status;
1840         }
1841 
1842         if (act) {
1843             nvme_aor_inc_active(ns);
1844         }
1845 
1846         nvme_aor_inc_open(ns);
1847 
1848         if (flags & NVME_ZRM_AUTO) {
1849             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1850             return NVME_SUCCESS;
1851         }
1852 
1853         /* fallthrough */
1854 
1855     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1856         if (flags & NVME_ZRM_AUTO) {
1857             return NVME_SUCCESS;
1858         }
1859 
1860         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1861 
1862         /* fallthrough */
1863 
1864     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1865         if (flags & NVME_ZRM_ZRWA) {
1866             ns->zns.numzrwa--;
1867 
1868             zone->d.za |= NVME_ZA_ZRWA_VALID;
1869         }
1870 
1871         return NVME_SUCCESS;
1872 
1873     default:
1874         return NVME_ZONE_INVAL_TRANSITION;
1875     }
1876 }
1877 
1878 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1879                                      NvmeZone *zone)
1880 {
1881     return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1882 }
1883 
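     /*
      * Descriptive note (added): advance the zone write pointer after a
      * completed write; when the pointer reaches the writable boundary the
      * zone is transitioned to Full.
      */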
1884 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1885                                  uint32_t nlb)
1886 {
1887     zone->d.wp += nlb;
1888 
1889     if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1890         nvme_zrm_finish(ns, zone);
1891     }
1892 }
1893 
1894 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
1895                                            uint32_t nlbc)
1896 {
1897     uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
1898 
1899     nlbc = nzrwafgs * ns->zns.zrwafg;
1900 
1901     trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
1902 
1903     zone->w_ptr += nlbc;
1904 
1905     nvme_advance_zone_wp(ns, zone, nlbc);
1906 }
1907 
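     /*
      * Descriptive note (added): account for a completed write in the zone
      * state. For zones with a valid ZRWA, writes ending beyond the current
      * random write area trigger an implicit flush (rounded up to the ZRWA
      * flush granularity); otherwise the write pointer is advanced directly.
      */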
1908 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1909 {
1910     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1911     NvmeZone *zone;
1912     uint64_t slba;
1913     uint32_t nlb;
1914 
1915     slba = le64_to_cpu(rw->slba);
1916     nlb = le16_to_cpu(rw->nlb) + 1;
1917     zone = nvme_get_zone_by_slba(ns, slba);
1918     assert(zone);
1919 
1920     if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1921         uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
1922         uint64_t elba = slba + nlb - 1;
1923 
1924         if (elba > ezrwa) {
1925             nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
1926         }
1927 
1928         return;
1929     }
1930 
1931     nvme_advance_zone_wp(ns, zone, nlb);
1932 }
1933 
1934 static inline bool nvme_is_write(NvmeRequest *req)
1935 {
1936     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1937 
1938     return rw->opcode == NVME_CMD_WRITE ||
1939            rw->opcode == NVME_CMD_ZONE_APPEND ||
1940            rw->opcode == NVME_CMD_WRITE_ZEROES;
1941 }
1942 
1943 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1944 {
1945     return qemu_get_aio_context();
1946 }
1947 
1948 static void nvme_misc_cb(void *opaque, int ret)
1949 {
1950     NvmeRequest *req = opaque;
1951 
1952     trace_pci_nvme_misc_cb(nvme_cid(req));
1953 
1954     if (ret) {
1955         nvme_aio_err(req, ret);
1956     }
1957 
1958     nvme_enqueue_req_completion(nvme_cq(req), req);
1959 }
1960 
1961 void nvme_rw_complete_cb(void *opaque, int ret)
1962 {
1963     NvmeRequest *req = opaque;
1964     NvmeNamespace *ns = req->ns;
1965     BlockBackend *blk = ns->blkconf.blk;
1966     BlockAcctCookie *acct = &req->acct;
1967     BlockAcctStats *stats = blk_get_stats(blk);
1968 
1969     trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1970 
1971     if (ret) {
1972         block_acct_failed(stats, acct);
1973         nvme_aio_err(req, ret);
1974     } else {
1975         block_acct_done(stats, acct);
1976     }
1977 
1978     if (ns->params.zoned && nvme_is_write(req)) {
1979         nvme_finalize_zoned_write(ns, req);
1980     }
1981 
1982     nvme_enqueue_req_completion(nvme_cq(req), req);
1983 }
1984 
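     /*
      * Descriptive note (added): first-stage completion callback for reads
      * and writes. When the namespace carries per-LBA metadata, a second
      * block-layer request is issued for the metadata region (or the region
      * is zeroed for Write Zeroes) before the request is finalized in
      * nvme_rw_complete_cb().
      */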
1985 static void nvme_rw_cb(void *opaque, int ret)
1986 {
1987     NvmeRequest *req = opaque;
1988     NvmeNamespace *ns = req->ns;
1989 
1990     BlockBackend *blk = ns->blkconf.blk;
1991 
1992     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1993 
1994     if (ret) {
1995         goto out;
1996     }
1997 
1998     if (ns->lbaf.ms) {
1999         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2000         uint64_t slba = le64_to_cpu(rw->slba);
2001         uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2002         uint64_t offset = nvme_moff(ns, slba);
2003 
2004         if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2005             size_t mlen = nvme_m2b(ns, nlb);
2006 
2007             req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2008                                                BDRV_REQ_MAY_UNMAP,
2009                                                nvme_rw_complete_cb, req);
2010             return;
2011         }
2012 
2013         if (nvme_ns_ext(ns) || req->cmd.mptr) {
2014             uint16_t status;
2015 
2016             nvme_sg_unmap(&req->sg);
2017             status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2018             if (status) {
2019                 ret = -EFAULT;
2020                 goto out;
2021             }
2022 
2023             if (req->cmd.opcode == NVME_CMD_READ) {
2024                 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
2025             }
2026 
2027             return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
2028         }
2029     }
2030 
2031 out:
2032     nvme_rw_complete_cb(req, ret);
2033 }
2034 
2035 static void nvme_verify_cb(void *opaque, int ret)
2036 {
2037     NvmeBounceContext *ctx = opaque;
2038     NvmeRequest *req = ctx->req;
2039     NvmeNamespace *ns = req->ns;
2040     BlockBackend *blk = ns->blkconf.blk;
2041     BlockAcctCookie *acct = &req->acct;
2042     BlockAcctStats *stats = blk_get_stats(blk);
2043     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2044     uint64_t slba = le64_to_cpu(rw->slba);
2045     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2046     uint16_t apptag = le16_to_cpu(rw->apptag);
2047     uint16_t appmask = le16_to_cpu(rw->appmask);
2048     uint32_t reftag = le32_to_cpu(rw->reftag);
2049     uint16_t status;
2050 
2051     trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2052 
2053     if (ret) {
2054         block_acct_failed(stats, acct);
2055         nvme_aio_err(req, ret);
2056         goto out;
2057     }
2058 
2059     block_acct_done(stats, acct);
2060 
2061     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2062         status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2063                                        ctx->mdata.iov.size, slba);
2064         if (status) {
2065             req->status = status;
2066             goto out;
2067         }
2068 
2069         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2070                                      ctx->mdata.bounce, ctx->mdata.iov.size,
2071                                      prinfo, slba, apptag, appmask, &reftag);
2072     }
2073 
2074 out:
2075     qemu_iovec_destroy(&ctx->data.iov);
2076     g_free(ctx->data.bounce);
2077 
2078     qemu_iovec_destroy(&ctx->mdata.iov);
2079     g_free(ctx->mdata.bounce);
2080 
2081     g_free(ctx);
2082 
2083     nvme_enqueue_req_completion(nvme_cq(req), req);
2084 }
2085 
2086 
2087 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2088 {
2089     NvmeBounceContext *ctx = opaque;
2090     NvmeRequest *req = ctx->req;
2091     NvmeNamespace *ns = req->ns;
2092     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2093     uint64_t slba = le64_to_cpu(rw->slba);
2094     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2095     size_t mlen = nvme_m2b(ns, nlb);
2096     uint64_t offset = nvme_moff(ns, slba);
2097     BlockBackend *blk = ns->blkconf.blk;
2098 
2099     trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2100 
2101     if (ret) {
2102         goto out;
2103     }
2104 
2105     ctx->mdata.bounce = g_malloc(mlen);
2106 
2107     qemu_iovec_reset(&ctx->mdata.iov);
2108     qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2109 
2110     req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2111                                 nvme_verify_cb, ctx);
2112     return;
2113 
2114 out:
2115     nvme_verify_cb(ctx, ret);
2116 }
2117 
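     /*
      * Descriptive note (added): Compare is implemented with bounce buffers.
      * The data (and, if present, metadata) on the medium is read into the
      * context below and then compared against the host buffer transferred
      * via nvme_bounce_data()/nvme_bounce_mdata().
      */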
2118 struct nvme_compare_ctx {
2119     struct {
2120         QEMUIOVector iov;
2121         uint8_t *bounce;
2122     } data;
2123 
2124     struct {
2125         QEMUIOVector iov;
2126         uint8_t *bounce;
2127     } mdata;
2128 };
2129 
2130 static void nvme_compare_mdata_cb(void *opaque, int ret)
2131 {
2132     NvmeRequest *req = opaque;
2133     NvmeNamespace *ns = req->ns;
2134     NvmeCtrl *n = nvme_ctrl(req);
2135     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2136     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2137     uint16_t apptag = le16_to_cpu(rw->apptag);
2138     uint16_t appmask = le16_to_cpu(rw->appmask);
2139     uint32_t reftag = le32_to_cpu(rw->reftag);
2140     struct nvme_compare_ctx *ctx = req->opaque;
2141     g_autofree uint8_t *buf = NULL;
2142     BlockBackend *blk = ns->blkconf.blk;
2143     BlockAcctCookie *acct = &req->acct;
2144     BlockAcctStats *stats = blk_get_stats(blk);
2145     uint16_t status = NVME_SUCCESS;
2146 
2147     trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2148 
2149     if (ret) {
2150         block_acct_failed(stats, acct);
2151         nvme_aio_err(req, ret);
2152         goto out;
2153     }
2154 
2155     buf = g_malloc(ctx->mdata.iov.size);
2156 
2157     status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2158                                NVME_TX_DIRECTION_TO_DEVICE, req);
2159     if (status) {
2160         req->status = status;
2161         goto out;
2162     }
2163 
2164     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2165         uint64_t slba = le64_to_cpu(rw->slba);
2166         uint8_t *bufp;
2167         uint8_t *mbufp = ctx->mdata.bounce;
2168         uint8_t *end = mbufp + ctx->mdata.iov.size;
2169         int16_t pil = 0;
2170 
2171         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2172                                 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2173                                 slba, apptag, appmask, &reftag);
2174         if (status) {
2175             req->status = status;
2176             goto out;
2177         }
2178 
2179         /*
2180          * When formatted with protection information, do not compare the DIF
2181          * tuple.
2182          */
2183         if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2184             pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2185         }
2186 
2187         for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2188             if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2189                 req->status = NVME_CMP_FAILURE;
2190                 goto out;
2191             }
2192         }
2193 
2194         goto out;
2195     }
2196 
2197     if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2198         req->status = NVME_CMP_FAILURE;
2199         goto out;
2200     }
2201 
2202     block_acct_done(stats, acct);
2203 
2204 out:
2205     qemu_iovec_destroy(&ctx->data.iov);
2206     g_free(ctx->data.bounce);
2207 
2208     qemu_iovec_destroy(&ctx->mdata.iov);
2209     g_free(ctx->mdata.bounce);
2210 
2211     g_free(ctx);
2212 
2213     nvme_enqueue_req_completion(nvme_cq(req), req);
2214 }
2215 
2216 static void nvme_compare_data_cb(void *opaque, int ret)
2217 {
2218     NvmeRequest *req = opaque;
2219     NvmeCtrl *n = nvme_ctrl(req);
2220     NvmeNamespace *ns = req->ns;
2221     BlockBackend *blk = ns->blkconf.blk;
2222     BlockAcctCookie *acct = &req->acct;
2223     BlockAcctStats *stats = blk_get_stats(blk);
2224 
2225     struct nvme_compare_ctx *ctx = req->opaque;
2226     g_autofree uint8_t *buf = NULL;
2227     uint16_t status;
2228 
2229     trace_pci_nvme_compare_data_cb(nvme_cid(req));
2230 
2231     if (ret) {
2232         block_acct_failed(stats, acct);
2233         nvme_aio_err(req, ret);
2234         goto out;
2235     }
2236 
2237     buf = g_malloc(ctx->data.iov.size);
2238 
2239     status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2240                               NVME_TX_DIRECTION_TO_DEVICE, req);
2241     if (status) {
2242         req->status = status;
2243         goto out;
2244     }
2245 
2246     if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2247         req->status = NVME_CMP_FAILURE;
2248         goto out;
2249     }
2250 
2251     if (ns->lbaf.ms) {
2252         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2253         uint64_t slba = le64_to_cpu(rw->slba);
2254         uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2255         size_t mlen = nvme_m2b(ns, nlb);
2256         uint64_t offset = nvme_moff(ns, slba);
2257 
2258         ctx->mdata.bounce = g_malloc(mlen);
2259 
2260         qemu_iovec_init(&ctx->mdata.iov, 1);
2261         qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2262 
2263         req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2264                                     nvme_compare_mdata_cb, req);
2265         return;
2266     }
2267 
2268     block_acct_done(stats, acct);
2269 
2270 out:
2271     qemu_iovec_destroy(&ctx->data.iov);
2272     g_free(ctx->data.bounce);
2273     g_free(ctx);
2274 
2275     nvme_enqueue_req_completion(nvme_cq(req), req);
2276 }
2277 
2278 typedef struct NvmeDSMAIOCB {
2279     BlockAIOCB common;
2280     BlockAIOCB *aiocb;
2281     NvmeRequest *req;
2282     QEMUBH *bh;
2283     int ret;
2284 
2285     NvmeDsmRange *range;
2286     unsigned int nr;
2287     unsigned int idx;
2288 } NvmeDSMAIOCB;
2289 
2290 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2291 {
2292     NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2293 
2294     /* break nvme_dsm_cb loop */
2295     iocb->idx = iocb->nr;
2296     iocb->ret = -ECANCELED;
2297 
2298     if (iocb->aiocb) {
2299         blk_aio_cancel_async(iocb->aiocb);
2300         iocb->aiocb = NULL;
2301     } else {
2302         /*
2303          * We only reach this if nvme_dsm_cancel() has already been called or
2304          * the command ran to completion and nvme_dsm_bh is scheduled to run.
2305          */
2306         assert(iocb->idx == iocb->nr);
2307     }
2308 }
2309 
2310 static const AIOCBInfo nvme_dsm_aiocb_info = {
2311     .aiocb_size   = sizeof(NvmeDSMAIOCB),
2312     .cancel_async = nvme_dsm_cancel,
2313 };
2314 
2315 static void nvme_dsm_bh(void *opaque)
2316 {
2317     NvmeDSMAIOCB *iocb = opaque;
2318 
2319     iocb->common.cb(iocb->common.opaque, iocb->ret);
2320 
2321     qemu_bh_delete(iocb->bh);
2322     iocb->bh = NULL;
2323     qemu_aio_unref(iocb);
2324 }
2325 
2326 static void nvme_dsm_cb(void *opaque, int ret);
2327 
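     /*
      * Descriptive note (added): per-range completion callback for Dataset
      * Management (deallocate). After the data blocks of a range have been
      * discarded, the corresponding metadata is zeroed as well, but only if
      * the discard actually left the data blocks reading back as zeroes.
      */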
2328 static void nvme_dsm_md_cb(void *opaque, int ret)
2329 {
2330     NvmeDSMAIOCB *iocb = opaque;
2331     NvmeRequest *req = iocb->req;
2332     NvmeNamespace *ns = req->ns;
2333     NvmeDsmRange *range;
2334     uint64_t slba;
2335     uint32_t nlb;
2336 
2337     if (ret < 0) {
2338         iocb->ret = ret;
2339         goto done;
2340     }
2341 
2342     if (!ns->lbaf.ms) {
2343         nvme_dsm_cb(iocb, 0);
2344         return;
2345     }
2346 
2347     range = &iocb->range[iocb->idx - 1];
2348     slba = le64_to_cpu(range->slba);
2349     nlb = le32_to_cpu(range->nlb);
2350 
2351     /*
2352      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2353      * the metadata.
2354      */
2355 
2356     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2357     if (ret) {
2358         if (ret < 0) {
2359             iocb->ret = ret;
2360             goto done;
2361         }
2362 
2363         nvme_dsm_cb(iocb, 0);
             return;
2364     }
2365 
2366     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2367                                         nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2368                                         nvme_dsm_cb, iocb);
2369     return;
2370 
2371 done:
2372     iocb->aiocb = NULL;
2373     qemu_bh_schedule(iocb->bh);
2374 }
2375 
2376 static void nvme_dsm_cb(void *opaque, int ret)
2377 {
2378     NvmeDSMAIOCB *iocb = opaque;
2379     NvmeRequest *req = iocb->req;
2380     NvmeCtrl *n = nvme_ctrl(req);
2381     NvmeNamespace *ns = req->ns;
2382     NvmeDsmRange *range;
2383     uint64_t slba;
2384     uint32_t nlb;
2385 
2386     if (ret < 0) {
2387         iocb->ret = ret;
2388         goto done;
2389     }
2390 
2391 next:
2392     if (iocb->idx == iocb->nr) {
2393         goto done;
2394     }
2395 
2396     range = &iocb->range[iocb->idx++];
2397     slba = le64_to_cpu(range->slba);
2398     nlb = le32_to_cpu(range->nlb);
2399 
2400     trace_pci_nvme_dsm_deallocate(slba, nlb);
2401 
2402     if (nlb > n->dmrsl) {
2403         trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2404         goto next;
2405     }
2406 
2407     if (nvme_check_bounds(ns, slba, nlb)) {
2408         trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2409                                              ns->id_ns.nsze);
2410         goto next;
2411     }
2412 
2413     iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2414                                    nvme_l2b(ns, nlb),
2415                                    nvme_dsm_md_cb, iocb);
2416     return;
2417 
2418 done:
2419     iocb->aiocb = NULL;
2420     qemu_bh_schedule(iocb->bh);
2421 }
2422 
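     /*
      * Descriptive note (added): Dataset Management. Only the Deallocate
      * attribute (AD) is acted upon; the range list is copied from the host
      * and each range is discarded asynchronously by the
      * nvme_dsm_cb()/nvme_dsm_md_cb() pair above.
      */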
2423 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2424 {
2425     NvmeNamespace *ns = req->ns;
2426     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2427     uint32_t attr = le32_to_cpu(dsm->attributes);
2428     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2429     uint16_t status = NVME_SUCCESS;
2430 
2431     trace_pci_nvme_dsm(nr, attr);
2432 
2433     if (attr & NVME_DSMGMT_AD) {
2434         NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2435                                          nvme_misc_cb, req);
2436 
2437         iocb->req = req;
2438         iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2439         iocb->ret = 0;
2440         iocb->range = g_new(NvmeDsmRange, nr);
2441         iocb->nr = nr;
2442         iocb->idx = 0;
2443 
2444         status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2445                           req);
2446         if (status) {
2447             return status;
2448         }
2449 
2450         req->aiocb = &iocb->common;
2451         nvme_dsm_cb(iocb, 0);
2452 
2453         return NVME_NO_COMPLETE;
2454     }
2455 
2456     return status;
2457 }
2458 
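     /*
      * Descriptive note (added): Verify. The data (and metadata) is read into
      * bounce buffers and, for namespaces formatted with protection
      * information, checked end-to-end in nvme_verify_cb(). The transfer size
      * is bounded by the vsl device parameter.
      */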
2459 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2460 {
2461     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2462     NvmeNamespace *ns = req->ns;
2463     BlockBackend *blk = ns->blkconf.blk;
2464     uint64_t slba = le64_to_cpu(rw->slba);
2465     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2466     size_t len = nvme_l2b(ns, nlb);
2467     int64_t offset = nvme_l2b(ns, slba);
2468     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2469     uint32_t reftag = le32_to_cpu(rw->reftag);
2470     NvmeBounceContext *ctx = NULL;
2471     uint16_t status;
2472 
2473     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2474 
2475     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2476         status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2477         if (status) {
2478             return status;
2479         }
2480 
2481         if (prinfo & NVME_PRINFO_PRACT) {
2482             return NVME_INVALID_PROT_INFO | NVME_DNR;
2483         }
2484     }
2485 
2486     if (len > n->page_size << n->params.vsl) {
2487         return NVME_INVALID_FIELD | NVME_DNR;
2488     }
2489 
2490     status = nvme_check_bounds(ns, slba, nlb);
2491     if (status) {
2492         return status;
2493     }
2494 
2495     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2496         status = nvme_check_dulbe(ns, slba, nlb);
2497         if (status) {
2498             return status;
2499         }
2500     }
2501 
2502     ctx = g_new0(NvmeBounceContext, 1);
2503     ctx->req = req;
2504 
2505     ctx->data.bounce = g_malloc(len);
2506 
2507     qemu_iovec_init(&ctx->data.iov, 1);
2508     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2509 
2510     block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2511                      BLOCK_ACCT_READ);
2512 
2513     req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2514                                 nvme_verify_mdata_in_cb, ctx);
2515     return NVME_NO_COMPLETE;
2516 }
2517 
2518 typedef struct NvmeCopyAIOCB {
2519     BlockAIOCB common;
2520     BlockAIOCB *aiocb;
2521     NvmeRequest *req;
2522     QEMUBH *bh;
2523     int ret;
2524 
2525     NvmeCopySourceRange *ranges;
2526     int nr;
2527     int idx;
2528 
2529     uint8_t *bounce;
2530     QEMUIOVector iov;
2531     struct {
2532         BlockAcctCookie read;
2533         BlockAcctCookie write;
2534     } acct;
2535 
2536     uint32_t reftag;
2537     uint64_t slba;
2538 
2539     NvmeZone *zone;
2540 } NvmeCopyAIOCB;
2541 
2542 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2543 {
2544     NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2545 
2546     iocb->ret = -ECANCELED;
2547 
2548     if (iocb->aiocb) {
2549         blk_aio_cancel_async(iocb->aiocb);
2550         iocb->aiocb = NULL;
2551     }
2552 }
2553 
2554 static const AIOCBInfo nvme_copy_aiocb_info = {
2555     .aiocb_size   = sizeof(NvmeCopyAIOCB),
2556     .cancel_async = nvme_copy_cancel,
2557 };
2558 
2559 static void nvme_copy_bh(void *opaque)
2560 {
2561     NvmeCopyAIOCB *iocb = opaque;
2562     NvmeRequest *req = iocb->req;
2563     NvmeNamespace *ns = req->ns;
2564     BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2565 
2566     if (iocb->idx != iocb->nr) {
2567         req->cqe.result = cpu_to_le32(iocb->idx);
2568     }
2569 
2570     qemu_iovec_destroy(&iocb->iov);
2571     g_free(iocb->bounce);
2572 
2573     qemu_bh_delete(iocb->bh);
2574     iocb->bh = NULL;
2575 
2576     if (iocb->ret < 0) {
2577         block_acct_failed(stats, &iocb->acct.read);
2578         block_acct_failed(stats, &iocb->acct.write);
2579     } else {
2580         block_acct_done(stats, &iocb->acct.read);
2581         block_acct_done(stats, &iocb->acct.write);
2582     }
2583 
2584     iocb->common.cb(iocb->common.opaque, iocb->ret);
2585     qemu_aio_unref(iocb);
2586 }
2587 
2588 static void nvme_copy_cb(void *opaque, int ret);
2589 
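     /*
      * Descriptive note (added): Copy is processed as a chain of callbacks,
      * one source range at a time: nvme_copy_cb() validates and reads the
      * source data, nvme_copy_in_cb() reads the source metadata,
      * nvme_copy_in_completed_cb() performs protection information checks and
      * writes the data to the destination, and nvme_copy_out_cb()/
      * nvme_copy_out_completed_cb() write the metadata and advance to the
      * next range.
      */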
2590 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2591 {
2592     NvmeCopyAIOCB *iocb = opaque;
2593     NvmeRequest *req = iocb->req;
2594     NvmeNamespace *ns = req->ns;
2595     NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
2596     uint32_t nlb = le32_to_cpu(range->nlb) + 1;
2597 
2598     if (ret < 0) {
2599         iocb->ret = ret;
2600         goto out;
2601     } else if (iocb->ret < 0) {
2602         goto out;
2603     }
2604 
2605     if (ns->params.zoned) {
2606         nvme_advance_zone_wp(ns, iocb->zone, nlb);
2607     }
2608 
2609     iocb->idx++;
2610     iocb->slba += nlb;
2611 out:
2612     nvme_copy_cb(iocb, iocb->ret);
2613 }
2614 
2615 static void nvme_copy_out_cb(void *opaque, int ret)
2616 {
2617     NvmeCopyAIOCB *iocb = opaque;
2618     NvmeRequest *req = iocb->req;
2619     NvmeNamespace *ns = req->ns;
2620     NvmeCopySourceRange *range;
2621     uint32_t nlb;
2622     size_t mlen;
2623     uint8_t *mbounce;
2624 
2625     if (ret < 0) {
2626         iocb->ret = ret;
2627         goto out;
2628     } else if (iocb->ret < 0) {
2629         goto out;
2630     }
2631 
2632     if (!ns->lbaf.ms) {
2633         nvme_copy_out_completed_cb(iocb, 0);
2634         return;
2635     }
2636 
2637     range = &iocb->ranges[iocb->idx];
2638     nlb = le32_to_cpu(range->nlb) + 1;
2639 
2640     mlen = nvme_m2b(ns, nlb);
2641     mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2642 
2643     qemu_iovec_reset(&iocb->iov);
2644     qemu_iovec_add(&iocb->iov, mbounce, mlen);
2645 
2646     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2647                                   &iocb->iov, 0, nvme_copy_out_completed_cb,
2648                                   iocb);
2649 
2650     return;
2651 
2652 out:
2653     nvme_copy_cb(iocb, ret);
2654 }
2655 
2656 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2657 {
2658     NvmeCopyAIOCB *iocb = opaque;
2659     NvmeRequest *req = iocb->req;
2660     NvmeNamespace *ns = req->ns;
2661     NvmeCopySourceRange *range;
2662     uint32_t nlb;
2663     size_t len;
2664     uint16_t status;
2665 
2666     if (ret < 0) {
2667         iocb->ret = ret;
2668         goto out;
2669     } else if (iocb->ret < 0) {
2670         goto out;
2671     }
2672 
2673     range = &iocb->ranges[iocb->idx];
2674     nlb = le32_to_cpu(range->nlb) + 1;
2675     len = nvme_l2b(ns, nlb);
2676 
2677     trace_pci_nvme_copy_out(iocb->slba, nlb);
2678 
2679     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2680         NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2681 
2682         uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2683         uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2684 
2685         uint16_t apptag = le16_to_cpu(range->apptag);
2686         uint16_t appmask = le16_to_cpu(range->appmask);
2687         uint32_t reftag = le32_to_cpu(range->reftag);
2688 
2689         uint64_t slba = le64_to_cpu(range->slba);
2690         size_t mlen = nvme_m2b(ns, nlb);
2691         uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2692 
2693         status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2694                                 slba, apptag, appmask, &reftag);
2695         if (status) {
2696             goto invalid;
2697         }
2698 
2699         apptag = le16_to_cpu(copy->apptag);
2700         appmask = le16_to_cpu(copy->appmask);
2701 
2702         if (prinfow & NVME_PRINFO_PRACT) {
2703             status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2704             if (status) {
2705                 goto invalid;
2706             }
2707 
2708             nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2709                                         apptag, &iocb->reftag);
2710         } else {
2711             status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2712                                     prinfow, iocb->slba, apptag, appmask,
2713                                     &iocb->reftag);
2714             if (status) {
2715                 goto invalid;
2716             }
2717         }
2718     }
2719 
2720     status = nvme_check_bounds(ns, iocb->slba, nlb);
2721     if (status) {
2722         goto invalid;
2723     }
2724 
2725     if (ns->params.zoned) {
2726         status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2727         if (status) {
2728             goto invalid;
2729         }
2730 
2731         if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
2732             iocb->zone->w_ptr += nlb;
2733         }
2734     }
2735 
2736     qemu_iovec_reset(&iocb->iov);
2737     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2738 
2739     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2740                                   &iocb->iov, 0, nvme_copy_out_cb, iocb);
2741 
2742     return;
2743 
2744 invalid:
2745     req->status = status;
2746     iocb->aiocb = NULL;
2747     if (iocb->bh) {
2748         qemu_bh_schedule(iocb->bh);
2749     }
2750 
2751     return;
2752 
2753 out:
2754     nvme_copy_cb(iocb, ret);
2755 }
2756 
2757 static void nvme_copy_in_cb(void *opaque, int ret)
2758 {
2759     NvmeCopyAIOCB *iocb = opaque;
2760     NvmeRequest *req = iocb->req;
2761     NvmeNamespace *ns = req->ns;
2762     NvmeCopySourceRange *range;
2763     uint64_t slba;
2764     uint32_t nlb;
2765 
2766     if (ret < 0) {
2767         iocb->ret = ret;
2768         goto out;
2769     } else if (iocb->ret < 0) {
2770         goto out;
2771     }
2772 
2773     if (!ns->lbaf.ms) {
2774         nvme_copy_in_completed_cb(iocb, 0);
2775         return;
2776     }
2777 
2778     range = &iocb->ranges[iocb->idx];
2779     slba = le64_to_cpu(range->slba);
2780     nlb = le32_to_cpu(range->nlb) + 1;
2781 
2782     qemu_iovec_reset(&iocb->iov);
2783     qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2784                    nvme_m2b(ns, nlb));
2785 
2786     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2787                                  &iocb->iov, 0, nvme_copy_in_completed_cb,
2788                                  iocb);
2789     return;
2790 
2791 out:
2792     nvme_copy_cb(iocb, iocb->ret);
2793 }
2794 
2795 static void nvme_copy_cb(void *opaque, int ret)
2796 {
2797     NvmeCopyAIOCB *iocb = opaque;
2798     NvmeRequest *req = iocb->req;
2799     NvmeNamespace *ns = req->ns;
2800     NvmeCopySourceRange *range;
2801     uint64_t slba;
2802     uint32_t nlb;
2803     size_t len;
2804     uint16_t status;
2805 
2806     if (ret < 0) {
2807         iocb->ret = ret;
2808         goto done;
2809     } else if (iocb->ret < 0) {
2810         goto done;
2811     }
2812 
2813     if (iocb->idx == iocb->nr) {
2814         goto done;
2815     }
2816 
2817     range = &iocb->ranges[iocb->idx];
2818     slba = le64_to_cpu(range->slba);
2819     nlb = le32_to_cpu(range->nlb) + 1;
2820     len = nvme_l2b(ns, nlb);
2821 
2822     trace_pci_nvme_copy_source_range(slba, nlb);
2823 
2824     if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2825         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2826         goto invalid;
2827     }
2828 
2829     status = nvme_check_bounds(ns, slba, nlb);
2830     if (status) {
2831         goto invalid;
2832     }
2833 
2834     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2835         status = nvme_check_dulbe(ns, slba, nlb);
2836         if (status) {
2837             goto invalid;
2838         }
2839     }
2840 
2841     if (ns->params.zoned) {
2842         status = nvme_check_zone_read(ns, slba, nlb);
2843         if (status) {
2844             goto invalid;
2845         }
2846     }
2847 
2848     qemu_iovec_reset(&iocb->iov);
2849     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2850 
2851     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2852                                  &iocb->iov, 0, nvme_copy_in_cb, iocb);
2853     return;
2854 
2855 invalid:
2856     req->status = status;
2857 done:
2858     iocb->aiocb = NULL;
2859     if (iocb->bh) {
2860         qemu_bh_schedule(iocb->bh);
2861     }
2862 }
2863 
2864 
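     /*
      * Descriptive note (added): Copy command entry point. Validate the
      * descriptor format and range count, copy the source range list from the
      * host, resolve the destination zone for zoned namespaces and kick off
      * the per-range callback chain above.
      */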
2865 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2866 {
2867     NvmeNamespace *ns = req->ns;
2868     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2869     NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2870                                       nvme_misc_cb, req);
2871     uint16_t nr = copy->nr + 1;
2872     uint8_t format = copy->control[0] & 0xf;
2873     uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2874     uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2875 
2876     uint16_t status;
2877 
2878     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2879 
2880     iocb->ranges = NULL;
2881     iocb->zone = NULL;
2882 
2883     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2884         ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2885         status = NVME_INVALID_FIELD | NVME_DNR;
2886         goto invalid;
2887     }
2888 
2889     if (!(n->id_ctrl.ocfs & (1 << format))) {
2890         trace_pci_nvme_err_copy_invalid_format(format);
2891         status = NVME_INVALID_FIELD | NVME_DNR;
2892         goto invalid;
2893     }
2894 
2895     if (nr > ns->id_ns.msrc + 1) {
2896         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2897         goto invalid;
2898     }
2899 
2900     iocb->ranges = g_new(NvmeCopySourceRange, nr);
2901 
2902     status = nvme_h2c(n, (uint8_t *)iocb->ranges,
2903                       sizeof(NvmeCopySourceRange) * nr, req);
2904     if (status) {
2905         goto invalid;
2906     }
2907 
2908     iocb->slba = le64_to_cpu(copy->sdlba);
2909 
2910     if (ns->params.zoned) {
2911         iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
2912         if (!iocb->zone) {
2913             status = NVME_LBA_RANGE | NVME_DNR;
2914             goto invalid;
2915         }
2916 
2917         status = nvme_zrm_auto(n, ns, iocb->zone);
2918         if (status) {
2919             goto invalid;
2920         }
2921     }
2922 
2923     iocb->req = req;
2924     iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
2925     iocb->ret = 0;
2926     iocb->nr = nr;
2927     iocb->idx = 0;
2928     iocb->reftag = le32_to_cpu(copy->reftag);
2929     iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
2930                               ns->lbasz + ns->lbaf.ms);
2931 
2932     qemu_iovec_init(&iocb->iov, 1);
2933 
2934     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
2935                      BLOCK_ACCT_READ);
2936     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
2937                      BLOCK_ACCT_WRITE);
2938 
2939     req->aiocb = &iocb->common;
2940     nvme_copy_cb(iocb, 0);
2941 
2942     return NVME_NO_COMPLETE;
2943 
2944 invalid:
2945     g_free(iocb->ranges);
2946     qemu_aio_unref(iocb);
2947     return status;
2948 }
2949 
2950 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2951 {
2952     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2953     NvmeNamespace *ns = req->ns;
2954     BlockBackend *blk = ns->blkconf.blk;
2955     uint64_t slba = le64_to_cpu(rw->slba);
2956     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2957     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2958     size_t data_len = nvme_l2b(ns, nlb);
2959     size_t len = data_len;
2960     int64_t offset = nvme_l2b(ns, slba);
2961     struct nvme_compare_ctx *ctx = NULL;
2962     uint16_t status;
2963 
2964     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2965 
2966     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
2967         return NVME_INVALID_PROT_INFO | NVME_DNR;
2968     }
2969 
2970     if (nvme_ns_ext(ns)) {
2971         len += nvme_m2b(ns, nlb);
2972     }
2973 
2974     status = nvme_check_mdts(n, len);
2975     if (status) {
2976         return status;
2977     }
2978 
2979     status = nvme_check_bounds(ns, slba, nlb);
2980     if (status) {
2981         return status;
2982     }
2983 
2984     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2985         status = nvme_check_dulbe(ns, slba, nlb);
2986         if (status) {
2987             return status;
2988         }
2989     }
2990 
2991     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2992     if (status) {
2993         return status;
2994     }
2995 
2996     ctx = g_new(struct nvme_compare_ctx, 1);
2997     ctx->data.bounce = g_malloc(data_len);
2998 
2999     req->opaque = ctx;
3000 
3001     qemu_iovec_init(&ctx->data.iov, 1);
3002     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3003 
3004     block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3005                      BLOCK_ACCT_READ);
3006     req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3007                                 nvme_compare_data_cb, req);
3008 
3009     return NVME_NO_COMPLETE;
3010 }
3011 
3012 typedef struct NvmeFlushAIOCB {
3013     BlockAIOCB common;
3014     BlockAIOCB *aiocb;
3015     NvmeRequest *req;
3016     QEMUBH *bh;
3017     int ret;
3018 
3019     NvmeNamespace *ns;
3020     uint32_t nsid;
3021     bool broadcast;
3022 } NvmeFlushAIOCB;
3023 
3024 static void nvme_flush_cancel(BlockAIOCB *acb)
3025 {
3026     NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3027 
3028     iocb->ret = -ECANCELED;
3029 
3030     if (iocb->aiocb) {
3031         blk_aio_cancel_async(iocb->aiocb);
3032     }
3033 }
3034 
3035 static const AIOCBInfo nvme_flush_aiocb_info = {
3036     .aiocb_size = sizeof(NvmeFlushAIOCB),
3037     .cancel_async = nvme_flush_cancel,
3038     .get_aio_context = nvme_get_aio_context,
3039 };
3040 
3041 static void nvme_flush_ns_cb(void *opaque, int ret)
3042 {
3043     NvmeFlushAIOCB *iocb = opaque;
3044     NvmeNamespace *ns = iocb->ns;
3045 
3046     if (ret < 0) {
3047         iocb->ret = ret;
3048         goto out;
3049     } else if (iocb->ret < 0) {
3050         goto out;
3051     }
3052 
3053     if (ns) {
3054         trace_pci_nvme_flush_ns(iocb->nsid);
3055 
3056         iocb->ns = NULL;
3057         iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3058         return;
3059     }
3060 
3061 out:
3062     iocb->aiocb = NULL;
3063     qemu_bh_schedule(iocb->bh);
3064 }
3065 
3066 static void nvme_flush_bh(void *opaque)
3067 {
3068     NvmeFlushAIOCB *iocb = opaque;
3069     NvmeRequest *req = iocb->req;
3070     NvmeCtrl *n = nvme_ctrl(req);
3071     int i;
3072 
3073     if (iocb->ret < 0) {
3074         goto done;
3075     }
3076 
3077     if (iocb->broadcast) {
3078         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3079             iocb->ns = nvme_ns(n, i);
3080             if (iocb->ns) {
3081                 iocb->nsid = i;
3082                 break;
3083             }
3084         }
3085     }
3086 
3087     if (!iocb->ns) {
3088         goto done;
3089     }
3090 
3091     nvme_flush_ns_cb(iocb, 0);
3092     return;
3093 
3094 done:
3095     qemu_bh_delete(iocb->bh);
3096     iocb->bh = NULL;
3097 
3098     iocb->common.cb(iocb->common.opaque, iocb->ret);
3099 
3100     qemu_aio_unref(iocb);
3101 
3102     return;
3103 }
3104 
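     /*
      * Descriptive note (added): Flush. A flush with the broadcast NSID is
      * applied to all attached namespaces; nvme_flush_bh() walks the
      * namespace list and issues one blk_aio_flush() per namespace through
      * nvme_flush_ns_cb().
      */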
3105 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3106 {
3107     NvmeFlushAIOCB *iocb;
3108     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3109     uint16_t status;
3110 
3111     iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3112 
3113     iocb->req = req;
3114     iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3115     iocb->ret = 0;
3116     iocb->ns = NULL;
3117     iocb->nsid = 0;
3118     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3119 
3120     if (!iocb->broadcast) {
3121         if (!nvme_nsid_valid(n, nsid)) {
3122             status = NVME_INVALID_NSID | NVME_DNR;
3123             goto out;
3124         }
3125 
3126         iocb->ns = nvme_ns(n, nsid);
3127         if (!iocb->ns) {
3128             status = NVME_INVALID_FIELD | NVME_DNR;
3129             goto out;
3130         }
3131 
3132         iocb->nsid = nsid;
3133     }
3134 
3135     req->aiocb = &iocb->common;
3136     qemu_bh_schedule(iocb->bh);
3137 
3138     return NVME_NO_COMPLETE;
3139 
3140 out:
3141     qemu_bh_delete(iocb->bh);
3142     iocb->bh = NULL;
3143     qemu_aio_unref(iocb);
3144 
3145     return status;
3146 }
3147 
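     /*
      * Descriptive note (added): Read. The request is checked against MDTS,
      * the namespace boundaries, the zone state (for zoned namespaces) and,
      * if DULBE is enabled, deallocated ranges; namespaces formatted with
      * protection information are handed off to nvme_dif_rw().
      */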
3148 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3149 {
3150     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3151     NvmeNamespace *ns = req->ns;
3152     uint64_t slba = le64_to_cpu(rw->slba);
3153     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3154     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3155     uint64_t data_size = nvme_l2b(ns, nlb);
3156     uint64_t mapped_size = data_size;
3157     uint64_t data_offset;
3158     BlockBackend *blk = ns->blkconf.blk;
3159     uint16_t status;
3160 
3161     if (nvme_ns_ext(ns)) {
3162         mapped_size += nvme_m2b(ns, nlb);
3163 
3164         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3165             bool pract = prinfo & NVME_PRINFO_PRACT;
3166 
3167             if (pract && ns->lbaf.ms == 8) {
3168                 mapped_size = data_size;
3169             }
3170         }
3171     }
3172 
3173     trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3174 
3175     status = nvme_check_mdts(n, mapped_size);
3176     if (status) {
3177         goto invalid;
3178     }
3179 
3180     status = nvme_check_bounds(ns, slba, nlb);
3181     if (status) {
3182         goto invalid;
3183     }
3184 
3185     if (ns->params.zoned) {
3186         status = nvme_check_zone_read(ns, slba, nlb);
3187         if (status) {
3188             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3189             goto invalid;
3190         }
3191     }
3192 
3193     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3194         status = nvme_check_dulbe(ns, slba, nlb);
3195         if (status) {
3196             goto invalid;
3197         }
3198     }
3199 
3200     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3201         return nvme_dif_rw(n, req);
3202     }
3203 
3204     status = nvme_map_data(n, nlb, req);
3205     if (status) {
3206         goto invalid;
3207     }
3208 
3209     data_offset = nvme_l2b(ns, slba);
3210 
3211     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3212                      BLOCK_ACCT_READ);
3213     nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3214     return NVME_NO_COMPLETE;
3215 
3216 invalid:
3217     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3218     return status | NVME_DNR;
3219 }
3220 
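     /*
      * Descriptive note (added): common implementation for Write, Write
      * Zeroes and Zone Append. For appends, the effective SLBA is taken from
      * the zone write pointer (and returned in the completion); when PIREMAP
      * is set, the reference tag is remapped according to the protection
      * information type of the namespace.
      */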
3221 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3222                               bool wrz)
3223 {
3224     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3225     NvmeNamespace *ns = req->ns;
3226     uint64_t slba = le64_to_cpu(rw->slba);
3227     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3228     uint16_t ctrl = le16_to_cpu(rw->control);
3229     uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3230     uint64_t data_size = nvme_l2b(ns, nlb);
3231     uint64_t mapped_size = data_size;
3232     uint64_t data_offset;
3233     NvmeZone *zone;
3234     NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3235     BlockBackend *blk = ns->blkconf.blk;
3236     uint16_t status;
3237 
3238     if (nvme_ns_ext(ns)) {
3239         mapped_size += nvme_m2b(ns, nlb);
3240 
3241         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3242             bool pract = prinfo & NVME_PRINFO_PRACT;
3243 
3244             if (pract && ns->lbaf.ms == 8) {
3245                 mapped_size -= nvme_m2b(ns, nlb);
3246             }
3247         }
3248     }
3249 
3250     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3251                          nvme_nsid(ns), nlb, mapped_size, slba);
3252 
3253     if (!wrz) {
3254         status = nvme_check_mdts(n, mapped_size);
3255         if (status) {
3256             goto invalid;
3257         }
3258     }
3259 
3260     status = nvme_check_bounds(ns, slba, nlb);
3261     if (status) {
3262         goto invalid;
3263     }
3264 
3265     if (ns->params.zoned) {
3266         zone = nvme_get_zone_by_slba(ns, slba);
3267         assert(zone);
3268 
3269         if (append) {
3270             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3271 
3272             if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3273                 return NVME_INVALID_ZONE_OP | NVME_DNR;
3274             }
3275 
3276             if (unlikely(slba != zone->d.zslba)) {
3277                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3278                 status = NVME_INVALID_FIELD;
3279                 goto invalid;
3280             }
3281 
3282             if (n->params.zasl &&
3283                 data_size > (uint64_t)n->page_size << n->params.zasl) {
3284                 trace_pci_nvme_err_zasl(data_size);
3285                 return NVME_INVALID_FIELD | NVME_DNR;
3286             }
3287 
3288             slba = zone->w_ptr;
3289             rw->slba = cpu_to_le64(slba);
3290             res->slba = cpu_to_le64(slba);
3291 
3292             switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3293             case NVME_ID_NS_DPS_TYPE_1:
3294                 if (!piremap) {
3295                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3296                 }
3297 
3298                 /* fallthrough */
3299 
3300             case NVME_ID_NS_DPS_TYPE_2:
3301                 if (piremap) {
3302                     uint32_t reftag = le32_to_cpu(rw->reftag);
3303                     rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3304                 }
3305 
3306                 break;
3307 
3308             case NVME_ID_NS_DPS_TYPE_3:
3309                 if (piremap) {
3310                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3311                 }
3312 
3313                 break;
3314             }
3315         }
3316 
3317         status = nvme_check_zone_write(ns, zone, slba, nlb);
3318         if (status) {
3319             goto invalid;
3320         }
3321 
3322         status = nvme_zrm_auto(n, ns, zone);
3323         if (status) {
3324             goto invalid;
3325         }
3326 
3327         if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3328             zone->w_ptr += nlb;
3329         }
3330     }
3331 
3332     data_offset = nvme_l2b(ns, slba);
3333 
3334     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3335         return nvme_dif_rw(n, req);
3336     }
3337 
3338     if (!wrz) {
3339         status = nvme_map_data(n, nlb, req);
3340         if (status) {
3341             goto invalid;
3342         }
3343 
3344         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3345                          BLOCK_ACCT_WRITE);
3346         nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3347     } else {
3348         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3349                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3350                                            req);
3351     }
3352 
3353     return NVME_NO_COMPLETE;
3354 
3355 invalid:
3356     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3357     return status | NVME_DNR;
3358 }
3359 
3360 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3361 {
3362     return nvme_do_write(n, req, false, false);
3363 }
3364 
3365 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3366 {
3367     return nvme_do_write(n, req, false, true);
3368 }
3369 
3370 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3371 {
3372     return nvme_do_write(n, req, true, false);
3373 }
3374 
3375 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3376                                             uint64_t *slba, uint32_t *zone_idx)
3377 {
3378     uint32_t dw10 = le32_to_cpu(c->cdw10);
3379     uint32_t dw11 = le32_to_cpu(c->cdw11);
3380 
3381     if (!ns->params.zoned) {
3382         trace_pci_nvme_err_invalid_opc(c->opcode);
3383         return NVME_INVALID_OPCODE | NVME_DNR;
3384     }
3385 
3386     *slba = ((uint64_t)dw11) << 32 | dw10;
3387     if (unlikely(*slba >= ns->id_ns.nsze)) {
3388         trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3389         *slba = 0;
3390         return NVME_LBA_RANGE | NVME_DNR;
3391     }
3392 
3393     *zone_idx = nvme_zone_idx(ns, *slba);
3394     assert(*zone_idx < ns->num_zones);
3395 
3396     return NVME_SUCCESS;
3397 }
3398 
3399 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3400                                  NvmeRequest *);
3401 
3402 enum NvmeZoneProcessingMask {
3403     NVME_PROC_CURRENT_ZONE    = 0,
3404     NVME_PROC_OPENED_ZONES    = 1 << 0,
3405     NVME_PROC_CLOSED_ZONES    = 1 << 1,
3406     NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3407     NVME_PROC_FULL_ZONES      = 1 << 3,
3408 };
3409 
3410 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3411                                NvmeZoneState state, NvmeRequest *req)
3412 {
3413     NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3414     int flags = 0;
3415 
3416     if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3417         uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3418 
3419         if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3420             return NVME_INVALID_ZONE_OP | NVME_DNR;
3421         }
3422 
3423         if (zone->w_ptr % ns->zns.zrwafg) {
3424             return NVME_NOZRWA | NVME_DNR;
3425         }
3426 
3427         flags = NVME_ZRM_ZRWA;
3428     }
3429 
3430     return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3431 }
3432 
3433 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3434                                 NvmeZoneState state, NvmeRequest *req)
3435 {
3436     return nvme_zrm_close(ns, zone);
3437 }
3438 
3439 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3440                                  NvmeZoneState state, NvmeRequest *req)
3441 {
3442     return nvme_zrm_finish(ns, zone);
3443 }
3444 
3445 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3446                                   NvmeZoneState state, NvmeRequest *req)
3447 {
3448     switch (state) {
3449     case NVME_ZONE_STATE_READ_ONLY:
3450         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3451         /* fall through */
3452     case NVME_ZONE_STATE_OFFLINE:
3453         return NVME_SUCCESS;
3454     default:
3455         return NVME_ZONE_INVAL_TRANSITION;
3456     }
3457 }
3458 
3459 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3460 {
3461     uint16_t status;
3462     uint8_t state = nvme_get_zone_state(zone);
3463 
3464     if (state == NVME_ZONE_STATE_EMPTY) {
3465         status = nvme_aor_check(ns, 1, 0);
3466         if (status) {
3467             return status;
3468         }
3469         nvme_aor_inc_active(ns);
3470         zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3471         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3472         return NVME_SUCCESS;
3473     }
3474 
3475     return NVME_ZONE_INVAL_TRANSITION;
3476 }
3477 
3478 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3479                                     enum NvmeZoneProcessingMask proc_mask,
3480                                     op_handler_t op_hndlr, NvmeRequest *req)
3481 {
3482     uint16_t status = NVME_SUCCESS;
3483     NvmeZoneState zs = nvme_get_zone_state(zone);
3484     bool proc_zone;
3485 
3486     switch (zs) {
3487     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3488     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3489         proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3490         break;
3491     case NVME_ZONE_STATE_CLOSED:
3492         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3493         break;
3494     case NVME_ZONE_STATE_READ_ONLY:
3495         proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3496         break;
3497     case NVME_ZONE_STATE_FULL:
3498         proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3499         break;
3500     default:
3501         proc_zone = false;
3502     }
3503 
3504     if (proc_zone) {
3505         status = op_hndlr(ns, zone, zs, req);
3506     }
3507 
3508     return status;
3509 }
3510 
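     /*
      * Apply op_hndlr either to the single addressed zone (empty proc_mask)
      * or to every zone in the states selected by proc_mask. Open, closed and
      * full zones are walked via their per-state lists; read-only zones are
      * not list-tracked, so the zone array is scanned for those instead.
      */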
3511 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3512                                 enum NvmeZoneProcessingMask proc_mask,
3513                                 op_handler_t op_hndlr, NvmeRequest *req)
3514 {
3515     NvmeZone *next;
3516     uint16_t status = NVME_SUCCESS;
3517     int i;
3518 
3519     if (!proc_mask) {
3520         status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3521     } else {
3522         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3523             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3524                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3525                                              req);
3526                 if (status && status != NVME_NO_COMPLETE) {
3527                     goto out;
3528                 }
3529             }
3530         }
3531         if (proc_mask & NVME_PROC_OPENED_ZONES) {
3532             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3533                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3534                                              req);
3535                 if (status && status != NVME_NO_COMPLETE) {
3536                     goto out;
3537                 }
3538             }
3539 
3540             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3541                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3542                                              req);
3543                 if (status && status != NVME_NO_COMPLETE) {
3544                     goto out;
3545                 }
3546             }
3547         }
3548         if (proc_mask & NVME_PROC_FULL_ZONES) {
3549             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3550                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3551                                              req);
3552                 if (status && status != NVME_NO_COMPLETE) {
3553                     goto out;
3554                 }
3555             }
3556         }
3557 
3558         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3559             for (i = 0; i < ns->num_zones; i++, zone++) {
3560                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3561                                              req);
3562                 if (status && status != NVME_NO_COMPLETE) {
3563                     goto out;
3564                 }
3565             }
3566         }
3567     }
3568 
3569 out:
3570     return status;
3571 }
3572 
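     /*
      * Zone reset runs asynchronously: nvme_zone_reset_cb() resets the
      * current zone (if any), picks the next resettable zone (or stops after
      * the addressed one when 'all' is not set) and zeroes/unmaps its data;
      * nvme_zone_reset_epilogue_cb() then zeroes the metadata, if any, and
      * re-enters nvme_zone_reset_cb(). The request is completed from the
      * bottom half once no zones remain.
      */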
3573 typedef struct NvmeZoneResetAIOCB {
3574     BlockAIOCB common;
3575     BlockAIOCB *aiocb;
3576     NvmeRequest *req;
3577     QEMUBH *bh;
3578     int ret;
3579 
3580     bool all;
3581     int idx;
3582     NvmeZone *zone;
3583 } NvmeZoneResetAIOCB;
3584 
3585 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3586 {
3587     NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3588     NvmeRequest *req = iocb->req;
3589     NvmeNamespace *ns = req->ns;
3590 
3591     iocb->idx = ns->num_zones;
3592 
3593     iocb->ret = -ECANCELED;
3594 
3595     if (iocb->aiocb) {
3596         blk_aio_cancel_async(iocb->aiocb);
3597         iocb->aiocb = NULL;
3598     }
3599 }
3600 
3601 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3602     .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3603     .cancel_async = nvme_zone_reset_cancel,
3604 };
3605 
3606 static void nvme_zone_reset_bh(void *opaque)
3607 {
3608     NvmeZoneResetAIOCB *iocb = opaque;
3609 
3610     iocb->common.cb(iocb->common.opaque, iocb->ret);
3611 
3612     qemu_bh_delete(iocb->bh);
3613     iocb->bh = NULL;
3614     qemu_aio_unref(iocb);
3615 }
3616 
3617 static void nvme_zone_reset_cb(void *opaque, int ret);
3618 
3619 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3620 {
3621     NvmeZoneResetAIOCB *iocb = opaque;
3622     NvmeRequest *req = iocb->req;
3623     NvmeNamespace *ns = req->ns;
3624     int64_t moff;
3625     int count;
3626 
3627     if (ret < 0) {
3628         nvme_zone_reset_cb(iocb, ret);
3629         return;
3630     }
3631 
3632     if (!ns->lbaf.ms) {
3633         nvme_zone_reset_cb(iocb, 0);
3634         return;
3635     }
3636 
3637     moff = nvme_moff(ns, iocb->zone->d.zslba);
3638     count = nvme_m2b(ns, ns->zone_size);
3639 
3640     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3641                                         BDRV_REQ_MAY_UNMAP,
3642                                         nvme_zone_reset_cb, iocb);
3643     return;
3644 }
3645 
3646 static void nvme_zone_reset_cb(void *opaque, int ret)
3647 {
3648     NvmeZoneResetAIOCB *iocb = opaque;
3649     NvmeRequest *req = iocb->req;
3650     NvmeNamespace *ns = req->ns;
3651 
3652     if (ret < 0) {
3653         iocb->ret = ret;
3654         goto done;
3655     }
3656 
3657     if (iocb->zone) {
3658         nvme_zrm_reset(ns, iocb->zone);
3659 
3660         if (!iocb->all) {
3661             goto done;
3662         }
3663     }
3664 
3665     while (iocb->idx < ns->num_zones) {
3666         NvmeZone *zone = &ns->zone_array[iocb->idx++];
3667 
3668         switch (nvme_get_zone_state(zone)) {
3669         case NVME_ZONE_STATE_EMPTY:
3670             if (!iocb->all) {
3671                 goto done;
3672             }
3673 
3674             continue;
3675 
3676         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3677         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3678         case NVME_ZONE_STATE_CLOSED:
3679         case NVME_ZONE_STATE_FULL:
3680             iocb->zone = zone;
3681             break;
3682 
3683         default:
3684             continue;
3685         }
3686 
3687         trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3688 
3689         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3690                                             nvme_l2b(ns, zone->d.zslba),
3691                                             nvme_l2b(ns, ns->zone_size),
3692                                             BDRV_REQ_MAY_UNMAP,
3693                                             nvme_zone_reset_epilogue_cb,
3694                                             iocb);
3695         return;
3696     }
3697 
3698 done:
3699     iocb->aiocb = NULL;
3700     if (iocb->bh) {
3701         qemu_bh_schedule(iocb->bh);
3702     }
3703 }
3704 
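     /*
      * Explicitly commit (flush) the Zone Random Write Area up to and
      * including elba. The flush boundary must lie within the ZRWA and cover
      * a multiple of the ZRWA flush granularity; on success both the internal
      * write pointer and the advertised zone write pointer advance past the
      * flushed logical blocks.
      */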
3705 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
3706                                                uint64_t elba, NvmeRequest *req)
3707 {
3708     NvmeNamespace *ns = req->ns;
3709     uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3710     uint64_t wp = zone->d.wp;
3711     uint32_t nlb = elba - wp + 1;
3712     uint16_t status;
3713 
3715     if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3716         return NVME_INVALID_ZONE_OP | NVME_DNR;
3717     }
3718 
3719     if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3720         return NVME_INVALID_FIELD | NVME_DNR;
3721     }
3722 
3723     if (elba < wp || elba > wp + ns->zns.zrwas) {
3724         return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
3725     }
3726 
3727     if (nlb % ns->zns.zrwafg) {
3728         return NVME_INVALID_FIELD | NVME_DNR;
3729     }
3730 
3731     status = nvme_zrm_auto(n, ns, zone);
3732     if (status) {
3733         return status;
3734     }
3735 
3736     zone->w_ptr += nlb;
3737 
3738     nvme_advance_zone_wp(ns, zone, nlb);
3739 
3740     return NVME_SUCCESS;
3741 }
3742 
3743 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3744 {
3745     NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3746     NvmeNamespace *ns = req->ns;
3747     NvmeZone *zone;
3748     NvmeZoneResetAIOCB *iocb;
3749     uint8_t *zd_ext;
3750     uint64_t slba = 0;
3751     uint32_t zone_idx = 0;
3752     uint16_t status;
3753     uint8_t action = cmd->zsa;
3754     bool all;
3755     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3756 
3757     all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
3758 
3759     req->status = NVME_SUCCESS;
3760 
3761     if (!all) {
3762         status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
3763         if (status) {
3764             return status;
3765         }
3766     }
3767 
3768     zone = &ns->zone_array[zone_idx];
3769     if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
3770         trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3771         return NVME_INVALID_FIELD | NVME_DNR;
3772     }
3773 
3774     switch (action) {
3775 
3776     case NVME_ZONE_ACTION_OPEN:
3777         if (all) {
3778             proc_mask = NVME_PROC_CLOSED_ZONES;
3779         }
3780         trace_pci_nvme_open_zone(slba, zone_idx, all);
3781         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3782         break;
3783 
3784     case NVME_ZONE_ACTION_CLOSE:
3785         if (all) {
3786             proc_mask = NVME_PROC_OPENED_ZONES;
3787         }
3788         trace_pci_nvme_close_zone(slba, zone_idx, all);
3789         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3790         break;
3791 
3792     case NVME_ZONE_ACTION_FINISH:
3793         if (all) {
3794             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3795         }
3796         trace_pci_nvme_finish_zone(slba, zone_idx, all);
3797         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3798         break;
3799 
3800     case NVME_ZONE_ACTION_RESET:
3801         trace_pci_nvme_reset_zone(slba, zone_idx, all);
3802 
3803         iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3804                            nvme_misc_cb, req);
3805 
3806         iocb->req = req;
3807         iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3808         iocb->ret = 0;
3809         iocb->all = all;
3810         iocb->idx = zone_idx;
3811         iocb->zone = NULL;
3812 
3813         req->aiocb = &iocb->common;
3814         nvme_zone_reset_cb(iocb, 0);
3815 
3816         return NVME_NO_COMPLETE;
3817 
3818     case NVME_ZONE_ACTION_OFFLINE:
3819         if (all) {
3820             proc_mask = NVME_PROC_READ_ONLY_ZONES;
3821         }
3822         trace_pci_nvme_offline_zone(slba, zone_idx, all);
3823         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3824         break;
3825 
3826     case NVME_ZONE_ACTION_SET_ZD_EXT:
3827         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3828         if (all || !ns->params.zd_extension_size) {
3829             return NVME_INVALID_FIELD | NVME_DNR;
3830         }
3831         zd_ext = nvme_get_zd_extension(ns, zone_idx);
3832         status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3833         if (status) {
3834             trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3835             return status;
3836         }
3837 
3838         status = nvme_set_zd_ext(ns, zone);
3839         if (status == NVME_SUCCESS) {
3840             trace_pci_nvme_zd_extension_set(zone_idx);
3841             return status;
3842         }
3843         break;
3844 
3845     case NVME_ZONE_ACTION_ZRWA_FLUSH:
3846         if (all) {
3847             return NVME_INVALID_FIELD | NVME_DNR;
3848         }
3849 
3850         return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
3851 
3852     default:
3853         trace_pci_nvme_err_invalid_mgmt_action(action);
3854         status = NVME_INVALID_FIELD;
3855     }
3856 
3857     if (status == NVME_ZONE_INVAL_TRANSITION) {
3858         trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3859                                                          zone->d.za);
3860     }
3861     if (status) {
3862         status |= NVME_DNR;
3863     }
3864 
3865     return status;
3866 }
3867 
3868 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3869 {
3870     NvmeZoneState zs = nvme_get_zone_state(zl);
3871 
3872     switch (zafs) {
3873     case NVME_ZONE_REPORT_ALL:
3874         return true;
3875     case NVME_ZONE_REPORT_EMPTY:
3876         return zs == NVME_ZONE_STATE_EMPTY;
3877     case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3878         return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3879     case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3880         return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3881     case NVME_ZONE_REPORT_CLOSED:
3882         return zs == NVME_ZONE_STATE_CLOSED;
3883     case NVME_ZONE_REPORT_FULL:
3884         return zs == NVME_ZONE_STATE_FULL;
3885     case NVME_ZONE_REPORT_READ_ONLY:
3886         return zs == NVME_ZONE_STATE_READ_ONLY;
3887     case NVME_ZONE_REPORT_OFFLINE:
3888         return zs == NVME_ZONE_STATE_OFFLINE;
3889     default:
3890         return false;
3891     }
3892 }
3893 
3894 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3895 {
3896     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3897     NvmeNamespace *ns = req->ns;
3898     /* cdw12 is the zero-based number of dwords to return; convert to bytes */
3899     uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3900     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3901     uint32_t zone_idx, zra, zrasf, partial;
3902     uint64_t max_zones, nr_zones = 0;
3903     uint16_t status;
3904     uint64_t slba;
3905     NvmeZoneDescr *z;
3906     NvmeZone *zone;
3907     NvmeZoneReportHeader *header;
3908     void *buf, *buf_p;
3909     size_t zone_entry_sz;
3910     int i;
3911 
3912     req->status = NVME_SUCCESS;
3913 
3914     status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3915     if (status) {
3916         return status;
3917     }
3918 
3919     zra = dw13 & 0xff;
3920     if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3921         return NVME_INVALID_FIELD | NVME_DNR;
3922     }
3923     if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3924         return NVME_INVALID_FIELD | NVME_DNR;
3925     }
3926 
3927     zrasf = (dw13 >> 8) & 0xff;
3928     if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3929         return NVME_INVALID_FIELD | NVME_DNR;
3930     }
3931 
3932     if (data_size < sizeof(NvmeZoneReportHeader)) {
3933         return NVME_INVALID_FIELD | NVME_DNR;
3934     }
3935 
3936     status = nvme_check_mdts(n, data_size);
3937     if (status) {
3938         return status;
3939     }
3940 
3941     partial = (dw13 >> 16) & 0x01;
3942 
3943     zone_entry_sz = sizeof(NvmeZoneDescr);
3944     if (zra == NVME_ZONE_REPORT_EXTENDED) {
3945         zone_entry_sz += ns->params.zd_extension_size;
3946     }
3947 
3948     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3949     buf = g_malloc0(data_size);
3950 
3951     zone = &ns->zone_array[zone_idx];
3952     for (i = zone_idx; i < ns->num_zones; i++) {
3953         if (partial && nr_zones >= max_zones) {
3954             break;
3955         }
3956         if (nvme_zone_matches_filter(zrasf, zone++)) {
3957             nr_zones++;
3958         }
3959     }
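         /*
          * The loop above counted the matching zones for the report header
          * (capped at max_zones for a partial report). Now fill in up to
          * max_zones zone descriptors, reporting an all-ones write pointer
          * for zones without a valid one and appending the zone descriptor
          * extension for extended reports.
          */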
3960     header = (NvmeZoneReportHeader *)buf;
3961     header->nr_zones = cpu_to_le64(nr_zones);
3962 
3963     buf_p = buf + sizeof(NvmeZoneReportHeader);
3964     for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3965         zone = &ns->zone_array[zone_idx];
3966         if (nvme_zone_matches_filter(zrasf, zone)) {
3967             z = (NvmeZoneDescr *)buf_p;
3968             buf_p += sizeof(NvmeZoneDescr);
3969 
3970             z->zt = zone->d.zt;
3971             z->zs = zone->d.zs;
3972             z->zcap = cpu_to_le64(zone->d.zcap);
3973             z->zslba = cpu_to_le64(zone->d.zslba);
3974             z->za = zone->d.za;
3975 
3976             if (nvme_wp_is_valid(zone)) {
3977                 z->wp = cpu_to_le64(zone->d.wp);
3978             } else {
3979                 z->wp = cpu_to_le64(~0ULL);
3980             }
3981 
3982             if (zra == NVME_ZONE_REPORT_EXTENDED) {
3983                 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3984                     memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3985                            ns->params.zd_extension_size);
3986                 }
3987                 buf_p += ns->params.zd_extension_size;
3988             }
3989 
3990             max_zones--;
3991         }
3992     }
3993 
3994     status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3995 
3996     g_free(buf);
3997 
3998     return status;
3999 }
4000 
4001 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4002 {
4003     NvmeNamespace *ns;
4004     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4005 
4006     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4007                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4008 
4009     if (!nvme_nsid_valid(n, nsid)) {
4010         return NVME_INVALID_NSID | NVME_DNR;
4011     }
4012 
4013     /*
4014      * In the base NVM command set, Flush may apply to all namespaces
4015      * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4016      * along with TP 4056 (Namespace Types), the semantics become ambiguous.
4017      *
4018      * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4019      * opcode with a specific command since we cannot determine a unique I/O
4020      * command set. In some other command set, opcode 0h could mean something
4021      * entirely different from a flush, with completely different
4022      * semantics - does an NSID of FFFFFFFFh then
4023      * mean "for all namespaces, apply whatever command set specific command
4024      * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4025      * whatever command that uses the 0h opcode if, and only if, it allows NSID
4026      * to be FFFFFFFFh"?
4027      *
4028      * Anyway (and luckily), for now, we do not care about this since the
4029      * device only supports namespace types that include the NVM Flush command
4030      * (NVM and Zoned), so always do an NVM Flush.
4031      */
4032     if (req->cmd.opcode == NVME_CMD_FLUSH) {
4033         return nvme_flush(n, req);
4034     }
4035 
4036     ns = nvme_ns(n, nsid);
4037     if (unlikely(!ns)) {
4038         return NVME_INVALID_FIELD | NVME_DNR;
4039     }
4040 
4041     if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4042         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4043         return NVME_INVALID_OPCODE | NVME_DNR;
4044     }
4045 
4046     if (ns->status) {
4047         return ns->status;
4048     }
4049 
4050     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4051         return NVME_INVALID_FIELD;
4052     }
4053 
4054     req->ns = ns;
4055 
4056     switch (req->cmd.opcode) {
4057     case NVME_CMD_WRITE_ZEROES:
4058         return nvme_write_zeroes(n, req);
4059     case NVME_CMD_ZONE_APPEND:
4060         return nvme_zone_append(n, req);
4061     case NVME_CMD_WRITE:
4062         return nvme_write(n, req);
4063     case NVME_CMD_READ:
4064         return nvme_read(n, req);
4065     case NVME_CMD_COMPARE:
4066         return nvme_compare(n, req);
4067     case NVME_CMD_DSM:
4068         return nvme_dsm(n, req);
4069     case NVME_CMD_VERIFY:
4070         return nvme_verify(n, req);
4071     case NVME_CMD_COPY:
4072         return nvme_copy(n, req);
4073     case NVME_CMD_ZONE_MGMT_SEND:
4074         return nvme_zone_mgmt_send(n, req);
4075     case NVME_CMD_ZONE_MGMT_RECV:
4076         return nvme_zone_mgmt_recv(n, req);
4077     default:
4078         assert(false);
4079     }
4080 
4081     return NVME_INVALID_OPCODE | NVME_DNR;
4082 }
4083 
4084 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4085 {
4086     n->sq[sq->sqid] = NULL;
4087     timer_free(sq->timer);
4088     g_free(sq->io_req);
4089     if (sq->sqid) {
4090         g_free(sq);
4091     }
4092 }
4093 
4094 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4095 {
4096     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4097     NvmeRequest *r, *next;
4098     NvmeSQueue *sq;
4099     NvmeCQueue *cq;
4100     uint16_t qid = le16_to_cpu(c->qid);
4101 
4102     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4103         trace_pci_nvme_err_invalid_del_sq(qid);
4104         return NVME_INVALID_QID | NVME_DNR;
4105     }
4106 
4107     trace_pci_nvme_del_sq(qid);
4108 
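         /*
          * Cancel any outstanding requests on the submission queue, then post
          * pending completions and move this queue's requests off the
          * completion queue's request list before the queue is freed.
          */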
4109     sq = n->sq[qid];
4110     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4111         r = QTAILQ_FIRST(&sq->out_req_list);
4112         assert(r->aiocb);
4113         blk_aio_cancel(r->aiocb);
4114     }
4115 
4116     assert(QTAILQ_EMPTY(&sq->out_req_list));
4117 
4118     if (!nvme_check_cqid(n, sq->cqid)) {
4119         cq = n->cq[sq->cqid];
4120         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4121 
4122         nvme_post_cqes(cq);
4123         QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4124             if (r->sq == sq) {
4125                 QTAILQ_REMOVE(&cq->req_list, r, entry);
4126                 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4127             }
4128         }
4129     }
4130 
4131     nvme_free_sq(sq, n);
4132     return NVME_SUCCESS;
4133 }
4134 
4135 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4136                          uint16_t sqid, uint16_t cqid, uint16_t size)
4137 {
4138     int i;
4139     NvmeCQueue *cq;
4140 
4141     sq->ctrl = n;
4142     sq->dma_addr = dma_addr;
4143     sq->sqid = sqid;
4144     sq->size = size;
4145     sq->cqid = cqid;
4146     sq->head = sq->tail = 0;
4147     sq->io_req = g_new0(NvmeRequest, sq->size);
4148 
4149     QTAILQ_INIT(&sq->req_list);
4150     QTAILQ_INIT(&sq->out_req_list);
4151     for (i = 0; i < sq->size; i++) {
4152         sq->io_req[i].sq = sq;
4153         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4154     }
4155     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
4156 
4157     assert(n->cq[cqid]);
4158     cq = n->cq[cqid];
4159     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4160     n->sq[sqid] = sq;
4161 }
4162 
4163 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4164 {
4165     NvmeSQueue *sq;
4166     NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4167 
4168     uint16_t cqid = le16_to_cpu(c->cqid);
4169     uint16_t sqid = le16_to_cpu(c->sqid);
4170     uint16_t qsize = le16_to_cpu(c->qsize);
4171     uint16_t qflags = le16_to_cpu(c->sq_flags);
4172     uint64_t prp1 = le64_to_cpu(c->prp1);
4173 
4174     trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4175 
4176     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4177         trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4178         return NVME_INVALID_CQID | NVME_DNR;
4179     }
4180     if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4181         n->sq[sqid] != NULL)) {
4182         trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4183         return NVME_INVALID_QID | NVME_DNR;
4184     }
4185     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4186         trace_pci_nvme_err_invalid_create_sq_size(qsize);
4187         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4188     }
4189     if (unlikely(prp1 & (n->page_size - 1))) {
4190         trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4191         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4192     }
4193     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4194         trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4195         return NVME_INVALID_FIELD | NVME_DNR;
4196     }
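         /* QSIZE is a 0's based value, so the queue holds qsize + 1 entries. */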
4197     sq = g_malloc0(sizeof(*sq));
4198     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4199     return NVME_SUCCESS;
4200 }
4201 
4202 struct nvme_stats {
4203     uint64_t units_read;
4204     uint64_t units_written;
4205     uint64_t read_commands;
4206     uint64_t write_commands;
4207 };
4208 
4209 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4210 {
4211     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4212 
4213     stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4214     stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4215     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4216     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4217 }
4218 
4219 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4220                                 uint64_t off, NvmeRequest *req)
4221 {
4222     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4223     struct nvme_stats stats = { 0 };
4224     NvmeSmartLog smart = { 0 };
4225     uint32_t trans_len;
4226     NvmeNamespace *ns;
4227     time_t current_ms;
4228 
4229     if (off >= sizeof(smart)) {
4230         return NVME_INVALID_FIELD | NVME_DNR;
4231     }
4232 
4233     if (nsid != 0xffffffff) {
4234         ns = nvme_ns(n, nsid);
4235         if (!ns) {
4236             return NVME_INVALID_NSID | NVME_DNR;
4237         }
4238         nvme_set_blk_stats(ns, &stats);
4239     } else {
4240         int i;
4241 
4242         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4243             ns = nvme_ns(n, i);
4244             if (!ns) {
4245                 continue;
4246             }
4247             nvme_set_blk_stats(ns, &stats);
4248         }
4249     }
4250 
4251     trans_len = MIN(sizeof(smart) - off, buf_len);
4252     smart.critical_warning = n->smart_critical_warning;
4253 
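         /*
          * Data Units Read/Written are reported in units of 1000 512-byte
          * units, rounded up (e.g. 2000 sectors read yields a value of 2);
          * units_read/units_written are accumulated in 512-byte sectors above.
          */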
4254     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4255                                                         1000));
4256     smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4257                                                            1000));
4258     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4259     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4260 
4261     smart.temperature = cpu_to_le16(n->temperature);
4262 
4263     if ((n->temperature >= n->features.temp_thresh_hi) ||
4264         (n->temperature <= n->features.temp_thresh_low)) {
4265         smart.critical_warning |= NVME_SMART_TEMPERATURE;
4266     }
4267 
4268     current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4269     smart.power_on_hours[0] =
4270         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4271 
4272     if (!rae) {
4273         nvme_clear_events(n, NVME_AER_TYPE_SMART);
4274     }
4275 
4276     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4277 }
4278 
4279 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4280                                  NvmeRequest *req)
4281 {
4282     uint32_t trans_len;
4283     NvmeFwSlotInfoLog fw_log = {
4284         .afi = 0x1,
4285     };
4286 
4287     if (off >= sizeof(fw_log)) {
4288         return NVME_INVALID_FIELD | NVME_DNR;
4289     }
4290 
4291     strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4292     trans_len = MIN(sizeof(fw_log) - off, buf_len);
4293 
4294     return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4295 }
4296 
4297 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4298                                 uint64_t off, NvmeRequest *req)
4299 {
4300     uint32_t trans_len;
4301     NvmeErrorLog errlog;
4302 
4303     if (off >= sizeof(errlog)) {
4304         return NVME_INVALID_FIELD | NVME_DNR;
4305     }
4306 
4307     if (!rae) {
4308         nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4309     }
4310 
4311     memset(&errlog, 0x0, sizeof(errlog));
4312     trans_len = MIN(sizeof(errlog) - off, buf_len);
4313 
4314     return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4315 }
4316 
4317 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4318                                     uint64_t off, NvmeRequest *req)
4319 {
4320     uint32_t nslist[1024];
4321     uint32_t trans_len;
4322     int i = 0;
4323     uint32_t nsid;
4324 
4325     if (off >= sizeof(nslist)) {
4326         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
4327         return NVME_INVALID_FIELD | NVME_DNR;
4328     }
4329 
4330     memset(nslist, 0x0, sizeof(nslist));
4331     trans_len = MIN(sizeof(nslist) - off, buf_len);
4332 
4333     while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4334             NVME_CHANGED_NSID_SIZE) {
4335         /*
4336          * If more than 1024 namespaces have changed, the spec requires the
4337          * first entry in the log page to be set to FFFFFFFFh and the rest to 0.
4338          */
4339         if (i == ARRAY_SIZE(nslist)) {
4340             memset(nslist, 0x0, sizeof(nslist));
4341             nslist[0] = 0xffffffff;
4342             break;
4343         }
4344 
4345         nslist[i++] = nsid;
4346         clear_bit(nsid, n->changed_nsids);
4347     }
4348 
4349     /*
4350      * Clear all remaining changed-namespace bits if we bailed out early
4351      * because more than 1024 namespaces have changed.
4352      */
4353     if (nslist[0] == 0xffffffff) {
4354         bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4355     }
4356 
4357     if (!rae) {
4358         nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4359     }
4360 
4361     return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4362 }
4363 
4364 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4365                                  uint64_t off, NvmeRequest *req)
4366 {
4367     NvmeEffectsLog log = {};
4368     const uint32_t *src_iocs = NULL;
4369     uint32_t trans_len;
4370 
4371     if (off >= sizeof(log)) {
4372         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4373         return NVME_INVALID_FIELD | NVME_DNR;
4374     }
4375 
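         /*
          * Which I/O command set effects are reported depends on how the
          * controller was configured: CC.CSS selects the NVM command set, no
          * I/O command set at all (Admin Only), or, for "all supported
          * command sets", the command set identified by the CSI field.
          */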
4376     switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4377     case NVME_CC_CSS_NVM:
4378         src_iocs = nvme_cse_iocs_nvm;
4379         /* fall through */
4380     case NVME_CC_CSS_ADMIN_ONLY:
4381         break;
4382     case NVME_CC_CSS_CSI:
4383         switch (csi) {
4384         case NVME_CSI_NVM:
4385             src_iocs = nvme_cse_iocs_nvm;
4386             break;
4387         case NVME_CSI_ZONED:
4388             src_iocs = nvme_cse_iocs_zoned;
4389             break;
4390         }
4391     }
4392 
4393     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4394 
4395     if (src_iocs) {
4396         memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4397     }
4398 
4399     trans_len = MIN(sizeof(log) - off, buf_len);
4400 
4401     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4402 }
4403 
4404 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4405 {
4406     NvmeCmd *cmd = &req->cmd;
4407 
4408     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4409     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4410     uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4411     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4412     uint8_t  lid = dw10 & 0xff;
4413     uint8_t  lsp = (dw10 >> 8) & 0xf;
4414     uint8_t  rae = (dw10 >> 15) & 0x1;
4415     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
4416     uint32_t numdl, numdu;
4417     uint64_t off, lpol, lpou;
4418     size_t   len;
4419     uint16_t status;
4420 
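         /*
          * NUMD is a 0's based dword count split across NUMDL (CDW10 bits
          * 31:16) and NUMDU (CDW11 bits 15:0); the log page offset is the
          * 64-bit byte offset formed from LPOL/LPOU and must be dword
          * aligned. E.g. NUMDU = 0, NUMDL = 0x3ff requests 1024 dwords
          * (4096 bytes).
          */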
4421     numdl = (dw10 >> 16);
4422     numdu = (dw11 & 0xffff);
4423     lpol = dw12;
4424     lpou = dw13;
4425 
4426     len = (((numdu << 16) | numdl) + 1) << 2;
4427     off = (lpou << 32ULL) | lpol;
4428 
4429     if (off & 0x3) {
4430         return NVME_INVALID_FIELD | NVME_DNR;
4431     }
4432 
4433     trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4434 
4435     status = nvme_check_mdts(n, len);
4436     if (status) {
4437         return status;
4438     }
4439 
4440     switch (lid) {
4441     case NVME_LOG_ERROR_INFO:
4442         return nvme_error_info(n, rae, len, off, req);
4443     case NVME_LOG_SMART_INFO:
4444         return nvme_smart_info(n, rae, len, off, req);
4445     case NVME_LOG_FW_SLOT_INFO:
4446         return nvme_fw_log_info(n, len, off, req);
4447     case NVME_LOG_CHANGED_NSLIST:
4448         return nvme_changed_nslist(n, rae, len, off, req);
4449     case NVME_LOG_CMD_EFFECTS:
4450         return nvme_cmd_effects(n, csi, len, off, req);
4451     default:
4452         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4453         return NVME_INVALID_FIELD | NVME_DNR;
4454     }
4455 }
4456 
4457 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4458 {
4459     n->cq[cq->cqid] = NULL;
4460     timer_free(cq->timer);
4461     if (msix_enabled(&n->parent_obj)) {
4462         msix_vector_unuse(&n->parent_obj, cq->vector);
4463     }
4464     if (cq->cqid) {
4465         g_free(cq);
4466     }
4467 }
4468 
4469 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4470 {
4471     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4472     NvmeCQueue *cq;
4473     uint16_t qid = le16_to_cpu(c->qid);
4474 
4475     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4476         trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4477         return NVME_INVALID_CQID | NVME_DNR;
4478     }
4479 
4480     cq = n->cq[qid];
4481     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4482         trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4483         return NVME_INVALID_QUEUE_DEL;
4484     }
4485 
4486     if (cq->irq_enabled && cq->tail != cq->head) {
4487         n->cq_pending--;
4488     }
4489 
4490     nvme_irq_deassert(n, cq);
4491     trace_pci_nvme_del_cq(qid);
4492     nvme_free_cq(cq, n);
4493     return NVME_SUCCESS;
4494 }
4495 
4496 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4497                          uint16_t cqid, uint16_t vector, uint16_t size,
4498                          uint16_t irq_enabled)
4499 {
4500     int ret;
4501 
4502     if (msix_enabled(&n->parent_obj)) {
4503         ret = msix_vector_use(&n->parent_obj, vector);
4504         assert(ret == 0);
4505     }
4506     cq->ctrl = n;
4507     cq->cqid = cqid;
4508     cq->size = size;
4509     cq->dma_addr = dma_addr;
4510     cq->phase = 1;
4511     cq->irq_enabled = irq_enabled;
4512     cq->vector = vector;
4513     cq->head = cq->tail = 0;
4514     QTAILQ_INIT(&cq->req_list);
4515     QTAILQ_INIT(&cq->sq_list);
4516     n->cq[cqid] = cq;
4517     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4518 }
4519 
4520 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4521 {
4522     NvmeCQueue *cq;
4523     NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4524     uint16_t cqid = le16_to_cpu(c->cqid);
4525     uint16_t vector = le16_to_cpu(c->irq_vector);
4526     uint16_t qsize = le16_to_cpu(c->qsize);
4527     uint16_t qflags = le16_to_cpu(c->cq_flags);
4528     uint64_t prp1 = le64_to_cpu(c->prp1);
4529 
4530     trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4531                              NVME_CQ_FLAGS_IEN(qflags) != 0);
4532 
4533     if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4534         n->cq[cqid] != NULL)) {
4535         trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4536         return NVME_INVALID_QID | NVME_DNR;
4537     }
4538     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4539         trace_pci_nvme_err_invalid_create_cq_size(qsize);
4540         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4541     }
4542     if (unlikely(prp1 & (n->page_size - 1))) {
4543         trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4544         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4545     }
4546     if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4547         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4548         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4549     }
4550     if (unlikely(vector >= n->params.msix_qsize)) {
4551         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4552         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4553     }
4554     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4555         trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4556         return NVME_INVALID_FIELD | NVME_DNR;
4557     }
4558 
4559     cq = g_malloc0(sizeof(*cq));
4560     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4561                  NVME_CQ_FLAGS_IEN(qflags));
4562 
4563     /*
4564      * It is only required to set qs_created when creating a completion queue;
4565      * creating a submission queue without a matching completion queue will
4566      * fail.
4567      */
4568     n->qs_created = true;
4569     return NVME_SUCCESS;
4570 }
4571 
4572 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4573 {
4574     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4575 
4576     return nvme_c2h(n, id, sizeof(id), req);
4577 }
4578 
4579 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4580 {
4581     trace_pci_nvme_identify_ctrl();
4582 
4583     return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4584 }
4585 
4586 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4587 {
4588     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4589     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4590     NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4591 
4592     trace_pci_nvme_identify_ctrl_csi(c->csi);
4593 
4594     switch (c->csi) {
4595     case NVME_CSI_NVM:
4596         id_nvm->vsl = n->params.vsl;
4597         id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4598         break;
4599 
4600     case NVME_CSI_ZONED:
4601         ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4602         break;
4603 
4604     default:
4605         return NVME_INVALID_FIELD | NVME_DNR;
4606     }
4607 
4608     return nvme_c2h(n, id, sizeof(id), req);
4609 }
4610 
4611 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4612 {
4613     NvmeNamespace *ns;
4614     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4615     uint32_t nsid = le32_to_cpu(c->nsid);
4616 
4617     trace_pci_nvme_identify_ns(nsid);
4618 
4619     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4620         return NVME_INVALID_NSID | NVME_DNR;
4621     }
4622 
4623     ns = nvme_ns(n, nsid);
4624     if (unlikely(!ns)) {
4625         if (!active) {
4626             ns = nvme_subsys_ns(n->subsys, nsid);
4627             if (!ns) {
4628                 return nvme_rpt_empty_id_struct(n, req);
4629             }
4630         } else {
4631             return nvme_rpt_empty_id_struct(n, req);
4632         }
4633     }
4634 
4635     if (active || ns->csi == NVME_CSI_NVM) {
4636         return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4637     }
4638 
4639     return NVME_INVALID_CMD_SET | NVME_DNR;
4640 }
4641 
4642 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4643                                         bool attached)
4644 {
4645     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4646     uint32_t nsid = le32_to_cpu(c->nsid);
4647     uint16_t min_id = le16_to_cpu(c->ctrlid);
4648     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4649     uint16_t *ids = &list[1];
4650     NvmeNamespace *ns;
4651     NvmeCtrl *ctrl;
4652     int cntlid, nr_ids = 0;
4653 
4654     trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4655 
4656     if (!n->subsys) {
4657         return NVME_INVALID_FIELD | NVME_DNR;
4658     }
4659 
4660     if (attached) {
4661         if (nsid == NVME_NSID_BROADCAST) {
4662             return NVME_INVALID_FIELD | NVME_DNR;
4663         }
4664 
4665         ns = nvme_subsys_ns(n->subsys, nsid);
4666         if (!ns) {
4667             return NVME_INVALID_FIELD | NVME_DNR;
4668         }
4669     }
4670 
4671     for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4672         ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4673         if (!ctrl) {
4674             continue;
4675         }
4676 
4677         if (attached && !nvme_ns(ctrl, nsid)) {
4678             continue;
4679         }
4680 
4681         ids[nr_ids++] = cntlid;
4682     }
4683 
4684     list[0] = nr_ids;
4685 
4686     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4687 }
4688 
4689 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4690                                      bool active)
4691 {
4692     NvmeNamespace *ns;
4693     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4694     uint32_t nsid = le32_to_cpu(c->nsid);
4695 
4696     trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4697 
4698     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4699         return NVME_INVALID_NSID | NVME_DNR;
4700     }
4701 
4702     ns = nvme_ns(n, nsid);
4703     if (unlikely(!ns)) {
4704         if (!active) {
4705             ns = nvme_subsys_ns(n->subsys, nsid);
4706             if (!ns) {
4707                 return nvme_rpt_empty_id_struct(n, req);
4708             }
4709         } else {
4710             return nvme_rpt_empty_id_struct(n, req);
4711         }
4712     }
4713 
4714     if (c->csi == NVME_CSI_NVM) {
4715         return nvme_rpt_empty_id_struct(n, req);
4716     } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4717         return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4718                         req);
4719     }
4720 
4721     return NVME_INVALID_FIELD | NVME_DNR;
4722 }
4723 
4724 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4725                                      bool active)
4726 {
4727     NvmeNamespace *ns;
4728     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4729     uint32_t min_nsid = le32_to_cpu(c->nsid);
4730     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4731     static const int data_len = sizeof(list);
4732     uint32_t *list_ptr = (uint32_t *)list;
4733     int i, j = 0;
4734 
4735     trace_pci_nvme_identify_nslist(min_nsid);
4736 
4737     /*
4738      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
4739      * since the Active Namespace ID List should return namespaces with ids
4740      * *higher* than the NSID specified in the command. This is also specified
4741      * in the spec (NVM Express v1.3d, Section 5.15.4).
4742      */
4743     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4744         return NVME_INVALID_NSID | NVME_DNR;
4745     }
4746 
4747     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4748         ns = nvme_ns(n, i);
4749         if (!ns) {
4750             if (!active) {
4751                 ns = nvme_subsys_ns(n->subsys, i);
4752                 if (!ns) {
4753                     continue;
4754                 }
4755             } else {
4756                 continue;
4757             }
4758         }
4759         if (ns->params.nsid <= min_nsid) {
4760             continue;
4761         }
4762         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4763         if (j == data_len / sizeof(uint32_t)) {
4764             break;
4765         }
4766     }
4767 
4768     return nvme_c2h(n, list, data_len, req);
4769 }
4770 
4771 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4772                                          bool active)
4773 {
4774     NvmeNamespace *ns;
4775     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4776     uint32_t min_nsid = le32_to_cpu(c->nsid);
4777     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4778     static const int data_len = sizeof(list);
4779     uint32_t *list_ptr = (uint32_t *)list;
4780     int i, j = 0;
4781 
4782     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4783 
4784     /*
4785      * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
4786      */
4787     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4788         return NVME_INVALID_NSID | NVME_DNR;
4789     }
4790 
4791     if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4792         return NVME_INVALID_FIELD | NVME_DNR;
4793     }
4794 
4795     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4796         ns = nvme_ns(n, i);
4797         if (!ns) {
4798             if (!active) {
4799                 ns = nvme_subsys_ns(n->subsys, i);
4800                 if (!ns) {
4801                     continue;
4802                 }
4803             } else {
4804                 continue;
4805             }
4806         }
4807         if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4808             continue;
4809         }
4810         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4811         if (j == data_len / sizeof(uint32_t)) {
4812             break;
4813         }
4814     }
4815 
4816     return nvme_c2h(n, list, data_len, req);
4817 }
4818 
4819 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4820 {
4821     NvmeNamespace *ns;
4822     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4823     uint32_t nsid = le32_to_cpu(c->nsid);
4824     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4825     uint8_t *pos = list;
4826     struct {
4827         NvmeIdNsDescr hdr;
4828         uint8_t v[NVME_NIDL_UUID];
4829     } QEMU_PACKED uuid = {};
4830     struct {
4831         NvmeIdNsDescr hdr;
4832         uint64_t v;
4833     } QEMU_PACKED eui64 = {};
4834     struct {
4835         NvmeIdNsDescr hdr;
4836         uint8_t v;
4837     } QEMU_PACKED csi = {};
4838 
4839     trace_pci_nvme_identify_ns_descr_list(nsid);
4840 
4841     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4842         return NVME_INVALID_NSID | NVME_DNR;
4843     }
4844 
4845     ns = nvme_ns(n, nsid);
4846     if (unlikely(!ns)) {
4847         return NVME_INVALID_FIELD | NVME_DNR;
4848     }
4849 
4850     /*
4851      * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4852      * provide a valid Namespace UUID in the Namespace Identification Descriptor
4853      * data structure. QEMU does not yet support setting NGUID.
4854      */
4855     uuid.hdr.nidt = NVME_NIDT_UUID;
4856     uuid.hdr.nidl = NVME_NIDL_UUID;
4857     memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4858     memcpy(pos, &uuid, sizeof(uuid));
4859     pos += sizeof(uuid);
4860 
4861     if (ns->params.eui64) {
4862         eui64.hdr.nidt = NVME_NIDT_EUI64;
4863         eui64.hdr.nidl = NVME_NIDL_EUI64;
4864         eui64.v = cpu_to_be64(ns->params.eui64);
4865         memcpy(pos, &eui64, sizeof(eui64));
4866         pos += sizeof(eui64);
4867     }
4868 
4869     csi.hdr.nidt = NVME_NIDT_CSI;
4870     csi.hdr.nidl = NVME_NIDL_CSI;
4871     csi.v = ns->csi;
4872     memcpy(pos, &csi, sizeof(csi));
4873     pos += sizeof(csi);
4874 
4875     return nvme_c2h(n, list, sizeof(list), req);
4876 }
4877 
4878 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4879 {
4880     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4881     static const int data_len = sizeof(list);
4882 
4883     trace_pci_nvme_identify_cmd_set();
4884 
4885     NVME_SET_CSI(*list, NVME_CSI_NVM);
4886     NVME_SET_CSI(*list, NVME_CSI_ZONED);
4887 
4888     return nvme_c2h(n, list, data_len, req);
4889 }
4890 
4891 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4892 {
4893     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4894 
4895     trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4896                             c->csi);
4897 
4898     switch (c->cns) {
4899     case NVME_ID_CNS_NS:
4900         return nvme_identify_ns(n, req, true);
4901     case NVME_ID_CNS_NS_PRESENT:
4902         return nvme_identify_ns(n, req, false);
4903     case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4904         return nvme_identify_ctrl_list(n, req, true);
4905     case NVME_ID_CNS_CTRL_LIST:
4906         return nvme_identify_ctrl_list(n, req, false);
4907     case NVME_ID_CNS_CS_NS:
4908         return nvme_identify_ns_csi(n, req, true);
4909     case NVME_ID_CNS_CS_NS_PRESENT:
4910         return nvme_identify_ns_csi(n, req, false);
4911     case NVME_ID_CNS_CTRL:
4912         return nvme_identify_ctrl(n, req);
4913     case NVME_ID_CNS_CS_CTRL:
4914         return nvme_identify_ctrl_csi(n, req);
4915     case NVME_ID_CNS_NS_ACTIVE_LIST:
4916         return nvme_identify_nslist(n, req, true);
4917     case NVME_ID_CNS_NS_PRESENT_LIST:
4918         return nvme_identify_nslist(n, req, false);
4919     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4920         return nvme_identify_nslist_csi(n, req, true);
4921     case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4922         return nvme_identify_nslist_csi(n, req, false);
4923     case NVME_ID_CNS_NS_DESCR_LIST:
4924         return nvme_identify_ns_descr_list(n, req);
4925     case NVME_ID_CNS_IO_COMMAND_SET:
4926         return nvme_identify_cmd_set(n, req);
4927     default:
4928         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4929         return NVME_INVALID_FIELD | NVME_DNR;
4930     }
4931 }
4932 
4933 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4934 {
4935     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4936 
4937     req->cqe.result = cpu_to_le32(1);
4938     if (nvme_check_sqid(n, sqid)) {
4939         return NVME_INVALID_FIELD | NVME_DNR;
4940     }
4941 
4942     return NVME_SUCCESS;
4943 }
4944 
4945 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4946 {
4947     trace_pci_nvme_setfeat_timestamp(ts);
4948 
4949     n->host_timestamp = le64_to_cpu(ts);
4950     n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4951 }
4952 
4953 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4954 {
4955     uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4956     uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4957 
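         /*
          * Layout of the Timestamp feature data structure: a 48-bit timestamp
          * in milliseconds, a synch bit indicating that the controller may
          * have stopped counting, and a 3-bit origin field (01b when the
          * timestamp was set by the host via Set Features).
          */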
4958     union nvme_timestamp {
4959         struct {
4960             uint64_t timestamp:48;
4961             uint64_t sync:1;
4962             uint64_t origin:3;
4963             uint64_t rsvd1:12;
4964         };
4965         uint64_t all;
4966     };
4967 
4968     union nvme_timestamp ts;
4969     ts.all = 0;
4970     ts.timestamp = n->host_timestamp + elapsed_time;
4971 
4972     /* If the host timestamp is non-zero, set the timestamp origin */
4973     ts.origin = n->host_timestamp ? 0x01 : 0x00;
4974 
4975     trace_pci_nvme_getfeat_timestamp(ts.all);
4976 
4977     return cpu_to_le64(ts.all);
4978 }
4979 
4980 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4981 {
4982     uint64_t timestamp = nvme_get_timestamp(n);
4983 
4984     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4985 }
4986 
4987 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4988 {
4989     NvmeCmd *cmd = &req->cmd;
4990     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4991     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4992     uint32_t nsid = le32_to_cpu(cmd->nsid);
4993     uint32_t result;
4994     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4995     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4996     uint16_t iv;
4997     NvmeNamespace *ns;
4998     int i;
4999 
5000     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
5001         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
5002     };
5003 
5004     trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
5005 
5006     if (!nvme_feature_support[fid]) {
5007         return NVME_INVALID_FIELD | NVME_DNR;
5008     }
5009 
5010     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5011         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5012             /*
5013              * The Reservation Notification Mask and Reservation Persistence
5014              * features require a status code of Invalid Field in Command when
5015              * NSID is FFFFFFFFh. Since the device does not support those
5016              * features we can always return Invalid Namespace or Format as we
5017              * should do for all other features.
5018              */
5019             return NVME_INVALID_NSID | NVME_DNR;
5020         }
5021 
5022         if (!nvme_ns(n, nsid)) {
5023             return NVME_INVALID_FIELD | NVME_DNR;
5024         }
5025     }
5026 
5027     switch (sel) {
5028     case NVME_GETFEAT_SELECT_CURRENT:
5029         break;
5030     case NVME_GETFEAT_SELECT_SAVED:
5031         /* no features are saveable by the controller; fallthrough */
5032     case NVME_GETFEAT_SELECT_DEFAULT:
5033         goto defaults;
5034     case NVME_GETFEAT_SELECT_CAP:
5035         result = nvme_feature_cap[fid];
5036         goto out;
5037     }
5038 
5039     switch (fid) {
5040     case NVME_TEMPERATURE_THRESHOLD:
5041         result = 0;
5042 
5043         /*
5044          * The controller only implements the Composite Temperature sensor, so
5045          * return 0 for all other sensors.
5046          */
5047         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5048             goto out;
5049         }
5050 
5051         switch (NVME_TEMP_THSEL(dw11)) {
5052         case NVME_TEMP_THSEL_OVER:
5053             result = n->features.temp_thresh_hi;
5054             goto out;
5055         case NVME_TEMP_THSEL_UNDER:
5056             result = n->features.temp_thresh_low;
5057             goto out;
5058         }
5059 
5060         return NVME_INVALID_FIELD | NVME_DNR;
5061     case NVME_ERROR_RECOVERY:
5062         if (!nvme_nsid_valid(n, nsid)) {
5063             return NVME_INVALID_NSID | NVME_DNR;
5064         }
5065 
5066         ns = nvme_ns(n, nsid);
5067         if (unlikely(!ns)) {
5068             return NVME_INVALID_FIELD | NVME_DNR;
5069         }
5070 
5071         result = ns->features.err_rec;
5072         goto out;
5073     case NVME_VOLATILE_WRITE_CACHE:
5074         result = 0;
5075         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5076             ns = nvme_ns(n, i);
5077             if (!ns) {
5078                 continue;
5079             }
5080 
5081             result = blk_enable_write_cache(ns->blkconf.blk);
5082             if (result) {
5083                 break;
5084             }
5085         }
5086         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
5087         goto out;
5088     case NVME_ASYNCHRONOUS_EVENT_CONF:
5089         result = n->features.async_config;
5090         goto out;
5091     case NVME_TIMESTAMP:
5092         return nvme_get_feature_timestamp(n, req);
5093     default:
5094         break;
5095     }
5096 
5097 defaults:
5098     switch (fid) {
5099     case NVME_TEMPERATURE_THRESHOLD:
5100         result = 0;
5101 
5102         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5103             break;
5104         }
5105 
5106         if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
5107             result = NVME_TEMPERATURE_WARNING;
5108         }
5109 
5110         break;
5111     case NVME_NUMBER_OF_QUEUES:
5112         result = (n->params.max_ioqpairs - 1) |
5113             ((n->params.max_ioqpairs - 1) << 16);
5114         trace_pci_nvme_getfeat_numq(result);
5115         break;
5116     case NVME_INTERRUPT_VECTOR_CONF:
5117         iv = dw11 & 0xffff;
5118         if (iv >= n->params.max_ioqpairs + 1) {
5119             return NVME_INVALID_FIELD | NVME_DNR;
5120         }
5121 
5122         result = iv;
5123         if (iv == n->admin_cq.vector) {
5124             result |= NVME_INTVC_NOCOALESCING;
5125         }
5126         break;
5127     default:
5128         result = nvme_feature_default[fid];
5129         break;
5130     }
5131 
5132 out:
5133     req->cqe.result = cpu_to_le32(result);
5134     return NVME_SUCCESS;
5135 }
5136 
5137 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
5138 {
5139     uint16_t ret;
5140     uint64_t timestamp;
5141 
5142     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
5143     if (ret) {
5144         return ret;
5145     }
5146 
5147     nvme_set_timestamp(n, timestamp);
5148 
5149     return NVME_SUCCESS;
5150 }
5151 
5152 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
5153 {
5154     NvmeNamespace *ns = NULL;
5155 
5156     NvmeCmd *cmd = &req->cmd;
5157     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5158     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5159     uint32_t nsid = le32_to_cpu(cmd->nsid);
5160     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5161     uint8_t save = NVME_SETFEAT_SAVE(dw10);
5162     int i;
5163 
5164     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5165 
5166     if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5167         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5168     }
5169 
5170     if (!nvme_feature_support[fid]) {
5171         return NVME_INVALID_FIELD | NVME_DNR;
5172     }
5173 
5174     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5175         if (nsid != NVME_NSID_BROADCAST) {
5176             if (!nvme_nsid_valid(n, nsid)) {
5177                 return NVME_INVALID_NSID | NVME_DNR;
5178             }
5179 
5180             ns = nvme_ns(n, nsid);
5181             if (unlikely(!ns)) {
5182                 return NVME_INVALID_FIELD | NVME_DNR;
5183             }
5184         }
5185     } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5186         if (!nvme_nsid_valid(n, nsid)) {
5187             return NVME_INVALID_NSID | NVME_DNR;
5188         }
5189 
5190         return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5191     }
5192 
5193     if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5194         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5195     }
5196 
5197     switch (fid) {
5198     case NVME_TEMPERATURE_THRESHOLD:
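        /*
         * Assumed dw11 layout for this feature: TMPTH in bits 15:0 (a
         * temperature in Kelvin), TMPSEL in bits 19:16 and THSEL in bits
         * 21:20. For example, an over temperature threshold of 360 K
         * (roughly 87 degrees C) on the composite sensor would be written
         * as dw11 = 168h.
         */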
5199         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5200             break;
5201         }
5202 
5203         switch (NVME_TEMP_THSEL(dw11)) {
5204         case NVME_TEMP_THSEL_OVER:
5205             n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5206             break;
5207         case NVME_TEMP_THSEL_UNDER:
5208             n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5209             break;
5210         default:
5211             return NVME_INVALID_FIELD | NVME_DNR;
5212         }
5213 
5214         if ((n->temperature >= n->features.temp_thresh_hi) ||
5215             (n->temperature <= n->features.temp_thresh_low)) {
5216             nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5217         }
5218 
5219         break;
5220     case NVME_ERROR_RECOVERY:
5221         if (nsid == NVME_NSID_BROADCAST) {
5222             for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5223                 ns = nvme_ns(n, i);
5224 
5225                 if (!ns) {
5226                     continue;
5227                 }
5228 
5229                 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5230                     ns->features.err_rec = dw11;
5231                 }
5232             }
5233 
5234             break;
5235         }
5236 
5237         assert(ns);
5238         if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
5239             ns->features.err_rec = dw11;
5240         }
5241         break;
5242     case NVME_VOLATILE_WRITE_CACHE:
5243         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5244             ns = nvme_ns(n, i);
5245             if (!ns) {
5246                 continue;
5247             }
5248 
5249             if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5250                 blk_flush(ns->blkconf.blk);
5251             }
5252 
5253             blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5254         }
5255 
5256         break;
5257 
5258     case NVME_NUMBER_OF_QUEUES:
5259         if (n->qs_created) {
5260             return NVME_CMD_SEQ_ERROR | NVME_DNR;
5261         }
5262 
5263         /*
5264          * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
5265          * and NSQR.
5266          */
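        /*
         * Illustrative encoding (example values assumed here): NSQR occupies
         * bits 15:0 and NCQR bits 31:16 of dw11, both 0's based, so a host
         * requesting 8 I/O submission and 8 I/O completion queues writes
         * dw11 = 00070007h. The completion value set below reports the
         * allocated counts in the same 0's based form.
         */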
5267         if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5268             return NVME_INVALID_FIELD | NVME_DNR;
5269         }
5270 
5271         trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5272                                     ((dw11 >> 16) & 0xffff) + 1,
5273                                     n->params.max_ioqpairs,
5274                                     n->params.max_ioqpairs);
5275         req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5276                                       ((n->params.max_ioqpairs - 1) << 16));
5277         break;
5278     case NVME_ASYNCHRONOUS_EVENT_CONF:
5279         n->features.async_config = dw11;
5280         break;
5281     case NVME_TIMESTAMP:
5282         return nvme_set_feature_timestamp(n, req);
5283     case NVME_COMMAND_SET_PROFILE:
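        /*
         * Bits 8:0 of dw11 carry the I/O Command Set Combination Index;
         * only index 0 is supported, so any non-zero index is rejected
         * below.
         */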
5284         if (dw11 & 0x1ff) {
5285             trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5286             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5287         }
5288         break;
5289     default:
5290         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5291     }
5292     return NVME_SUCCESS;
5293 }
5294 
5295 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5296 {
5297     trace_pci_nvme_aer(nvme_cid(req));
5298 
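    /*
     * aerl is a 0's based value, so up to aerl + 1 Asynchronous Event
     * Request commands may be outstanding at any time.
     */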
5299     if (n->outstanding_aers > n->params.aerl) {
5300         trace_pci_nvme_aer_aerl_exceeded();
5301         return NVME_AER_LIMIT_EXCEEDED;
5302     }
5303 
5304     n->aer_reqs[n->outstanding_aers] = req;
5305     n->outstanding_aers++;
5306 
5307     if (!QTAILQ_EMPTY(&n->aer_queue)) {
5308         nvme_process_aers(n);
5309     }
5310 
5311     return NVME_NO_COMPLETE;
5312 }
5313 
5314 static void nvme_update_dmrsl(NvmeCtrl *n)
5315 {
5316     int nsid;
5317 
5318     for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5319         NvmeNamespace *ns = nvme_ns(n, nsid);
5320         if (!ns) {
5321             continue;
5322         }
5323 
5324         n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5325                                 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5326     }
5327 }
5328 
5329 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5330 {
5331     uint32_t cc = ldl_le_p(&n->bar.cc);
5332 
5333     ns->iocs = nvme_cse_iocs_none;
5334     switch (ns->csi) {
5335     case NVME_CSI_NVM:
5336         if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5337             ns->iocs = nvme_cse_iocs_nvm;
5338         }
5339         break;
5340     case NVME_CSI_ZONED:
5341         if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5342             ns->iocs = nvme_cse_iocs_zoned;
5343         } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5344             ns->iocs = nvme_cse_iocs_nvm;
5345         }
5346         break;
5347     }
5348 }
5349 
5350 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5351 {
5352     NvmeNamespace *ns;
5353     NvmeCtrl *ctrl;
5354     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5355     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5356     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5357     uint8_t sel = dw10 & 0xf;
5358     uint16_t *nr_ids = &list[0];
5359     uint16_t *ids = &list[1];
5360     uint16_t ret;
5361     int i;
5362 
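    /*
     * The buffer transferred from the host below is a Controller List: a
     * 16-bit entry count followed by 16-bit controller identifiers, which
     * is why nr_ids and ids point at list[0] and list[1] respectively.
     */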
5363     trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5364 
5365     if (!nvme_nsid_valid(n, nsid)) {
5366         return NVME_INVALID_NSID | NVME_DNR;
5367     }
5368 
5369     ns = nvme_subsys_ns(n->subsys, nsid);
5370     if (!ns) {
5371         return NVME_INVALID_FIELD | NVME_DNR;
5372     }
5373 
5374     ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5375     if (ret) {
5376         return ret;
5377     }
5378 
5379     if (!*nr_ids) {
5380         return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5381     }
5382 
5383     *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5384     for (i = 0; i < *nr_ids; i++) {
5385         ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5386         if (!ctrl) {
5387             return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5388         }
5389 
5390         switch (sel) {
5391         case NVME_NS_ATTACHMENT_ATTACH:
5392             if (nvme_ns(ctrl, nsid)) {
5393                 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5394             }
5395 
5396             if (ns->attached && !ns->params.shared) {
5397                 return NVME_NS_PRIVATE | NVME_DNR;
5398             }
5399 
5400             nvme_attach_ns(ctrl, ns);
5401             nvme_select_iocs_ns(ctrl, ns);
5402 
5403             break;
5404 
5405         case NVME_NS_ATTACHMENT_DETACH:
5406             if (!nvme_ns(ctrl, nsid)) {
5407                 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5408             }
5409 
5410             ctrl->namespaces[nsid] = NULL;
5411             ns->attached--;
5412 
5413             nvme_update_dmrsl(ctrl);
5414 
5415             break;
5416 
5417         default:
5418             return NVME_INVALID_FIELD | NVME_DNR;
5419         }
5420 
5421         /*
5422          * Add namespace id to the changed namespace id list for event clearing
5423          * via Get Log Page command.
5424          */
5425         if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5426             nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5427                                NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5428                                NVME_LOG_CHANGED_NSLIST);
5429         }
5430     }
5431 
5432     return NVME_SUCCESS;
5433 }
5434 
5435 typedef struct NvmeFormatAIOCB {
5436     BlockAIOCB common;
5437     BlockAIOCB *aiocb;
5438     QEMUBH *bh;
5439     NvmeRequest *req;
5440     int ret;
5441 
5442     NvmeNamespace *ns;
5443     uint32_t nsid;
5444     bool broadcast;
5445     int64_t offset;
5446 } NvmeFormatAIOCB;
5447 
5448 static void nvme_format_bh(void *opaque);
5449 
5450 static void nvme_format_cancel(BlockAIOCB *aiocb)
5451 {
5452     NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5453 
5454     if (iocb->aiocb) {
5455         blk_aio_cancel_async(iocb->aiocb);
5456     }
5457 }
5458 
5459 static const AIOCBInfo nvme_format_aiocb_info = {
5460     .aiocb_size = sizeof(NvmeFormatAIOCB),
5461     .cancel_async = nvme_format_cancel,
5462     .get_aio_context = nvme_get_aio_context,
5463 };
5464 
5465 static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5466 {
5467     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5468     uint8_t lbaf = dw10 & 0xf;
5469     uint8_t pi = (dw10 >> 5) & 0x7;
5470     uint8_t mset = (dw10 >> 4) & 0x1;
5471     uint8_t pil = (dw10 >> 8) & 0x1;
5472 
5473     trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5474 
5475     ns->id_ns.dps = (pil << 3) | pi;
5476     ns->id_ns.flbas = lbaf | (mset << 4);
5477 
5478     nvme_ns_init_format(ns);
5479 }
5480 
5481 static void nvme_format_ns_cb(void *opaque, int ret)
5482 {
5483     NvmeFormatAIOCB *iocb = opaque;
5484     NvmeRequest *req = iocb->req;
5485     NvmeNamespace *ns = iocb->ns;
5486     int bytes;
5487 
5488     if (ret < 0) {
5489         iocb->ret = ret;
5490         goto done;
5491     }
5492 
5493     assert(ns);
5494 
5495     if (iocb->offset < ns->size) {
5496         bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5497 
5498         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5499                                             bytes, BDRV_REQ_MAY_UNMAP,
5500                                             nvme_format_ns_cb, iocb);
5501 
5502         iocb->offset += bytes;
5503         return;
5504     }
5505 
5506     nvme_format_set(ns, &req->cmd);
5507     ns->status = 0x0;
5508     iocb->ns = NULL;
5509     iocb->offset = 0;
5510 
5511 done:
5512     iocb->aiocb = NULL;
5513     qemu_bh_schedule(iocb->bh);
5514 }
5515 
5516 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5517 {
5518     if (ns->params.zoned) {
5519         return NVME_INVALID_FORMAT | NVME_DNR;
5520     }
5521 
5522     if (lbaf > ns->id_ns.nlbaf) {
5523         return NVME_INVALID_FORMAT | NVME_DNR;
5524     }
5525 
5526     if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5527         return NVME_INVALID_FORMAT | NVME_DNR;
5528     }
5529 
5530     if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5531         return NVME_INVALID_FIELD | NVME_DNR;
5532     }
5533 
5534     return NVME_SUCCESS;
5535 }
5536 
5537 static void nvme_format_bh(void *opaque)
5538 {
5539     NvmeFormatAIOCB *iocb = opaque;
5540     NvmeRequest *req = iocb->req;
5541     NvmeCtrl *n = nvme_ctrl(req);
5542     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5543     uint8_t lbaf = dw10 & 0xf;
5544     uint8_t pi = (dw10 >> 5) & 0x7;
5545     uint16_t status;
5546     int i;
5547 
5548     if (iocb->ret < 0) {
5549         goto done;
5550     }
5551 
5552     if (iocb->broadcast) {
5553         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5554             iocb->ns = nvme_ns(n, i);
5555             if (iocb->ns) {
5556                 iocb->nsid = i;
5557                 break;
5558             }
5559         }
5560     }
5561 
5562     if (!iocb->ns) {
5563         goto done;
5564     }
5565 
5566     status = nvme_format_check(iocb->ns, lbaf, pi);
5567     if (status) {
5568         req->status = status;
5569         goto done;
5570     }
5571 
5572     iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5573     nvme_format_ns_cb(iocb, 0);
5574     return;
5575 
5576 done:
5577     qemu_bh_delete(iocb->bh);
5578     iocb->bh = NULL;
5579 
5580     iocb->common.cb(iocb->common.opaque, iocb->ret);
5581 
5582     qemu_aio_unref(iocb);
5583 }
5584 
5585 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5586 {
5587     NvmeFormatAIOCB *iocb;
5588     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5589     uint16_t status;
5590 
5591     iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5592 
5593     iocb->req = req;
5594     iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5595     iocb->ret = 0;
5596     iocb->ns = NULL;
5597     iocb->nsid = 0;
5598     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5599     iocb->offset = 0;
5600 
5601     if (!iocb->broadcast) {
5602         if (!nvme_nsid_valid(n, nsid)) {
5603             status = NVME_INVALID_NSID | NVME_DNR;
5604             goto out;
5605         }
5606 
5607         iocb->ns = nvme_ns(n, nsid);
5608         if (!iocb->ns) {
5609             status = NVME_INVALID_FIELD | NVME_DNR;
5610             goto out;
5611         }
5612     }
5613 
5614     req->aiocb = &iocb->common;
5615     qemu_bh_schedule(iocb->bh);
5616 
5617     return NVME_NO_COMPLETE;
5618 
5619 out:
5620     qemu_bh_delete(iocb->bh);
5621     iocb->bh = NULL;
5622     qemu_aio_unref(iocb);
5623     return status;
5624 }
5625 
5626 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5627 {
5628     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5629                              nvme_adm_opc_str(req->cmd.opcode));
5630 
5631     if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5632         trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5633         return NVME_INVALID_OPCODE | NVME_DNR;
5634     }
5635 
5636     /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5637     if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5638         return NVME_INVALID_FIELD | NVME_DNR;
5639     }
5640 
5641     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
5642         return NVME_INVALID_FIELD;
5643     }
5644 
5645     switch (req->cmd.opcode) {
5646     case NVME_ADM_CMD_DELETE_SQ:
5647         return nvme_del_sq(n, req);
5648     case NVME_ADM_CMD_CREATE_SQ:
5649         return nvme_create_sq(n, req);
5650     case NVME_ADM_CMD_GET_LOG_PAGE:
5651         return nvme_get_log(n, req);
5652     case NVME_ADM_CMD_DELETE_CQ:
5653         return nvme_del_cq(n, req);
5654     case NVME_ADM_CMD_CREATE_CQ:
5655         return nvme_create_cq(n, req);
5656     case NVME_ADM_CMD_IDENTIFY:
5657         return nvme_identify(n, req);
5658     case NVME_ADM_CMD_ABORT:
5659         return nvme_abort(n, req);
5660     case NVME_ADM_CMD_SET_FEATURES:
5661         return nvme_set_feature(n, req);
5662     case NVME_ADM_CMD_GET_FEATURES:
5663         return nvme_get_feature(n, req);
5664     case NVME_ADM_CMD_ASYNC_EV_REQ:
5665         return nvme_aer(n, req);
5666     case NVME_ADM_CMD_NS_ATTACHMENT:
5667         return nvme_ns_attachment(n, req);
5668     case NVME_ADM_CMD_FORMAT_NVM:
5669         return nvme_format(n, req);
5670     default:
5671         assert(false);
5672     }
5673 
5674     return NVME_INVALID_OPCODE | NVME_DNR;
5675 }
5676 
5677 static void nvme_process_sq(void *opaque)
5678 {
5679     NvmeSQueue *sq = opaque;
5680     NvmeCtrl *n = sq->ctrl;
5681     NvmeCQueue *cq = n->cq[sq->cqid];
5682 
5683     uint16_t status;
5684     hwaddr addr;
5685     NvmeCmd cmd;
5686     NvmeRequest *req;
5687 
5688     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5689         addr = sq->dma_addr + sq->head * n->sqe_size;
5690         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5691             trace_pci_nvme_err_addr_read(addr);
5692             trace_pci_nvme_err_cfs();
5693             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5694             break;
5695         }
5696         nvme_inc_sq_head(sq);
5697 
5698         req = QTAILQ_FIRST(&sq->req_list);
5699         QTAILQ_REMOVE(&sq->req_list, req, entry);
5700         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5701         nvme_req_clear(req);
5702         req->cqe.cid = cmd.cid;
5703         memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5704 
5705         status = sq->sqid ? nvme_io_cmd(n, req) :
5706             nvme_admin_cmd(n, req);
5707         if (status != NVME_NO_COMPLETE) {
5708             req->status = status;
5709             nvme_enqueue_req_completion(cq, req);
5710         }
5711     }
5712 }
5713 
5714 static void nvme_ctrl_reset(NvmeCtrl *n)
5715 {
5716     NvmeNamespace *ns;
5717     int i;
5718 
5719     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5720         ns = nvme_ns(n, i);
5721         if (!ns) {
5722             continue;
5723         }
5724 
5725         nvme_ns_drain(ns);
5726     }
5727 
5728     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5729         if (n->sq[i] != NULL) {
5730             nvme_free_sq(n->sq[i], n);
5731         }
5732     }
5733     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5734         if (n->cq[i] != NULL) {
5735             nvme_free_cq(n->cq[i], n);
5736         }
5737     }
5738 
5739     while (!QTAILQ_EMPTY(&n->aer_queue)) {
5740         NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5741         QTAILQ_REMOVE(&n->aer_queue, event, entry);
5742         g_free(event);
5743     }
5744 
5745     n->aer_queued = 0;
5746     n->outstanding_aers = 0;
5747     n->qs_created = false;
5748 }
5749 
5750 static void nvme_ctrl_shutdown(NvmeCtrl *n)
5751 {
5752     NvmeNamespace *ns;
5753     int i;
5754 
5755     if (n->pmr.dev) {
5756         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5757     }
5758 
5759     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5760         ns = nvme_ns(n, i);
5761         if (!ns) {
5762             continue;
5763         }
5764 
5765         nvme_ns_shutdown(ns);
5766     }
5767 }
5768 
5769 static void nvme_select_iocs(NvmeCtrl *n)
5770 {
5771     NvmeNamespace *ns;
5772     int i;
5773 
5774     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5775         ns = nvme_ns(n, i);
5776         if (!ns) {
5777             continue;
5778         }
5779 
5780         nvme_select_iocs_ns(n, ns);
5781     }
5782 }
5783 
5784 static int nvme_start_ctrl(NvmeCtrl *n)
5785 {
5786     uint64_t cap = ldq_le_p(&n->bar.cap);
5787     uint32_t cc = ldl_le_p(&n->bar.cc);
5788     uint32_t aqa = ldl_le_p(&n->bar.aqa);
5789     uint64_t asq = ldq_le_p(&n->bar.asq);
5790     uint64_t acq = ldq_le_p(&n->bar.acq);
5791     uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5792     uint32_t page_size = 1 << page_bits;
5793 
5794     if (unlikely(n->cq[0])) {
5795         trace_pci_nvme_err_startfail_cq();
5796         return -1;
5797     }
5798     if (unlikely(n->sq[0])) {
5799         trace_pci_nvme_err_startfail_sq();
5800         return -1;
5801     }
5802     if (unlikely(asq & (page_size - 1))) {
5803         trace_pci_nvme_err_startfail_asq_misaligned(asq);
5804         return -1;
5805     }
5806     if (unlikely(acq & (page_size - 1))) {
5807         trace_pci_nvme_err_startfail_acq_misaligned(acq);
5808         return -1;
5809     }
5810     if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5811         trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5812         return -1;
5813     }
5814     if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5815         trace_pci_nvme_err_startfail_page_too_small(
5816                     NVME_CC_MPS(cc),
5817                     NVME_CAP_MPSMIN(cap));
5818         return -1;
5819     }
5820     if (unlikely(NVME_CC_MPS(cc) >
5821                  NVME_CAP_MPSMAX(cap))) {
5822         trace_pci_nvme_err_startfail_page_too_large(
5823                     NVME_CC_MPS(cc),
5824                     NVME_CAP_MPSMAX(cap));
5825         return -1;
5826     }
5827     if (unlikely(NVME_CC_IOCQES(cc) <
5828                  NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5829         trace_pci_nvme_err_startfail_cqent_too_small(
5830                     NVME_CC_IOCQES(cc),
5831                     NVME_CTRL_CQES_MIN(cap));
5832         return -1;
5833     }
5834     if (unlikely(NVME_CC_IOCQES(cc) >
5835                  NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5836         trace_pci_nvme_err_startfail_cqent_too_large(
5837                     NVME_CC_IOCQES(cc),
5838                     NVME_CTRL_CQES_MAX(cap));
5839         return -1;
5840     }
5841     if (unlikely(NVME_CC_IOSQES(cc) <
5842                  NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5843         trace_pci_nvme_err_startfail_sqent_too_small(
5844                     NVME_CC_IOSQES(cc),
5845                     NVME_CTRL_SQES_MIN(cap));
5846         return -1;
5847     }
5848     if (unlikely(NVME_CC_IOSQES(cc) >
5849                  NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5850         trace_pci_nvme_err_startfail_sqent_too_large(
5851                     NVME_CC_IOSQES(cc),
5852                     NVME_CTRL_SQES_MAX(cap));
5853         return -1;
5854     }
5855     if (unlikely(!NVME_AQA_ASQS(aqa))) {
5856         trace_pci_nvme_err_startfail_asqent_sz_zero();
5857         return -1;
5858     }
5859     if (unlikely(!NVME_AQA_ACQS(aqa))) {
5860         trace_pci_nvme_err_startfail_acqent_sz_zero();
5861         return -1;
5862     }
5863 
5864     n->page_bits = page_bits;
5865     n->page_size = page_size;
5866     n->max_prp_ents = n->page_size / sizeof(uint64_t);
5867     n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5868     n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5869     nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5870     nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5871 
5872     nvme_set_timestamp(n, 0ULL);
5873 
5874     QTAILQ_INIT(&n->aer_queue);
5875 
5876     nvme_select_iocs(n);
5877 
5878     return 0;
5879 }
5880 
5881 static void nvme_cmb_enable_regs(NvmeCtrl *n)
5882 {
5883     uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5884     uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5885 
5886     NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5887     NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5888     NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5889     stl_le_p(&n->bar.cmbloc, cmbloc);
5890 
5891     NVME_CMBSZ_SET_SQS(cmbsz, 1);
5892     NVME_CMBSZ_SET_CQS(cmbsz, 0);
5893     NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5894     NVME_CMBSZ_SET_RDS(cmbsz, 1);
5895     NVME_CMBSZ_SET_WDS(cmbsz, 1);
5896     NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
5897     NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5898     stl_le_p(&n->bar.cmbsz, cmbsz);
5899 }
5900 
5901 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5902                            unsigned size)
5903 {
5904     uint64_t cap = ldq_le_p(&n->bar.cap);
5905     uint32_t cc = ldl_le_p(&n->bar.cc);
5906     uint32_t intms = ldl_le_p(&n->bar.intms);
5907     uint32_t csts = ldl_le_p(&n->bar.csts);
5908     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5909 
5910     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5911         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5912                        "MMIO write not 32-bit aligned,"
5913                        " offset=0x%"PRIx64"", offset);
5914         /* should be ignored, fall through for now */
5915     }
5916 
5917     if (unlikely(size < sizeof(uint32_t))) {
5918         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5919                        "MMIO write smaller than 32-bits,"
5920                        " offset=0x%"PRIx64", size=%u",
5921                        offset, size);
5922         /* should be ignored, fall through for now */
5923     }
5924 
5925     switch (offset) {
5926     case NVME_REG_INTMS:
5927         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5928             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5929                            "undefined access to interrupt mask set"
5930                            " when MSI-X is enabled");
5931             /* should be ignored, fall through for now */
5932         }
5933         intms |= data;
5934         stl_le_p(&n->bar.intms, intms);
5935         n->bar.intmc = n->bar.intms;
5936         trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5937         nvme_irq_check(n);
5938         break;
5939     case NVME_REG_INTMC:
5940         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5941             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5942                            "undefined access to interrupt mask clr"
5943                            " when MSI-X is enabled");
5944             /* should be ignored, fall through for now */
5945         }
5946         intms &= ~data;
5947         stl_le_p(&n->bar.intms, intms);
5948         n->bar.intmc = n->bar.intms;
5949         trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5950         nvme_irq_check(n);
5951         break;
5952     case NVME_REG_CC:
5953         trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5954 
5955         /* Windows first sends data, then sends enable bit */
5956         if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
5957             !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
5958         {
5959             cc = data;
5960         }
5961 
5962         if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5963             cc = data;
5964 
5965             /* flush CC since nvme_start_ctrl() needs the value */
5966             stl_le_p(&n->bar.cc, cc);
5967             if (unlikely(nvme_start_ctrl(n))) {
5968                 trace_pci_nvme_err_startfail();
5969                 csts = NVME_CSTS_FAILED;
5970             } else {
5971                 trace_pci_nvme_mmio_start_success();
5972                 csts = NVME_CSTS_READY;
5973             }
5974         } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5975             trace_pci_nvme_mmio_stopped();
5976             nvme_ctrl_reset(n);
5977             cc = 0;
5978             csts &= ~NVME_CSTS_READY;
5979         }
5980 
5981         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5982             trace_pci_nvme_mmio_shutdown_set();
5983             nvme_ctrl_shutdown(n);
5984             cc = data;
5985             csts |= NVME_CSTS_SHST_COMPLETE;
5986         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5987             trace_pci_nvme_mmio_shutdown_cleared();
5988             csts &= ~NVME_CSTS_SHST_COMPLETE;
5989             cc = data;
5990         }
5991 
5992         stl_le_p(&n->bar.cc, cc);
5993         stl_le_p(&n->bar.csts, csts);
5994 
5995         break;
5996     case NVME_REG_CSTS:
5997         if (data & (1 << 4)) {
5998             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5999                            "attempted to W1C CSTS.NSSRO"
6000                            " but CAP.NSSRS is zero (not supported)");
6001         } else if (data != 0) {
6002             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
6003                            "attempted to set a read only bit"
6004                            " of controller status");
6005         }
6006         break;
6007     case NVME_REG_NSSR:
6008         if (data == 0x4e564d65) {
6009             trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
6010         } else {
6011             /* The spec says that writes of other values have no effect */
6012             return;
6013         }
6014         break;
6015     case NVME_REG_AQA:
6016         stl_le_p(&n->bar.aqa, data);
6017         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
6018         break;
6019     case NVME_REG_ASQ:
6020         stn_le_p(&n->bar.asq, size, data);
6021         trace_pci_nvme_mmio_asqaddr(data);
6022         break;
6023     case NVME_REG_ASQ + 4:
6024         stl_le_p((uint8_t *)&n->bar.asq + 4, data);
6025         trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
6026         break;
6027     case NVME_REG_ACQ:
6028         trace_pci_nvme_mmio_acqaddr(data);
6029         stn_le_p(&n->bar.acq, size, data);
6030         break;
6031     case NVME_REG_ACQ + 4:
6032         stl_le_p((uint8_t *)&n->bar.acq + 4, data);
6033         trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
6034         break;
6035     case NVME_REG_CMBLOC:
6036         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
6037                        "invalid write to reserved CMBLOC"
6038                        " when CMBSZ is zero, ignored");
6039         return;
6040     case NVME_REG_CMBSZ:
6041         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
6042                        "invalid write to read only CMBSZ, ignored");
6043         return;
6044     case NVME_REG_CMBMSC:
6045         if (!NVME_CAP_CMBS(cap)) {
6046             return;
6047         }
6048 
6049         stn_le_p(&n->bar.cmbmsc, size, data);
6050         n->cmb.cmse = false;
6051 
6052         if (NVME_CMBMSC_CRE(data)) {
6053             nvme_cmb_enable_regs(n);
6054 
6055             if (NVME_CMBMSC_CMSE(data)) {
6056                 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
6057                 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
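                /*
                 * Reject a controller base address that would wrap around
                 * the 64-bit address space; the addition below overflows in
                 * exactly that case.
                 */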
6058                 if (cba + int128_get64(n->cmb.mem.size) < cba) {
6059                     uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
6060                     NVME_CMBSTS_SET_CBAI(cmbsts, 1);
6061                     stl_le_p(&n->bar.cmbsts, cmbsts);
6062                     return;
6063                 }
6064 
6065                 n->cmb.cba = cba;
6066                 n->cmb.cmse = true;
6067             }
6068         } else {
6069             n->bar.cmbsz = 0;
6070             n->bar.cmbloc = 0;
6071         }
6072 
6073         return;
6074     case NVME_REG_CMBMSC + 4:
6075         stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
6076         return;
6077 
6078     case NVME_REG_PMRCAP:
6079         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
6080                        "invalid write to PMRCAP register, ignored");
6081         return;
6082     case NVME_REG_PMRCTL:
6083         if (!NVME_CAP_PMRS(cap)) {
6084             return;
6085         }
6086 
6087         stl_le_p(&n->bar.pmrctl, data);
6088         if (NVME_PMRCTL_EN(data)) {
6089             memory_region_set_enabled(&n->pmr.dev->mr, true);
6090             pmrsts = 0;
6091         } else {
6092             memory_region_set_enabled(&n->pmr.dev->mr, false);
6093             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
6094             n->pmr.cmse = false;
6095         }
6096         stl_le_p(&n->bar.pmrsts, pmrsts);
6097         return;
6098     case NVME_REG_PMRSTS:
6099         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
6100                        "invalid write to PMRSTS register, ignored");
6101         return;
6102     case NVME_REG_PMREBS:
6103         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
6104                        "invalid write to PMREBS register, ignored");
6105         return;
6106     case NVME_REG_PMRSWTP:
6107         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
6108                        "invalid write to PMRSWTP register, ignored");
6109         return;
6110     case NVME_REG_PMRMSCL:
6111         if (!NVME_CAP_PMRS(cap)) {
6112             return;
6113         }
6114 
6115         stl_le_p(&n->bar.pmrmscl, data);
6116         n->pmr.cmse = false;
6117 
6118         if (NVME_PMRMSCL_CMSE(data)) {
6119             uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
6120             hwaddr cba = pmrmscu << 32 |
6121                 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
6122             if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
6123                 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
6124                 stl_le_p(&n->bar.pmrsts, pmrsts);
6125                 return;
6126             }
6127 
6128             n->pmr.cmse = true;
6129             n->pmr.cba = cba;
6130         }
6131 
6132         return;
6133     case NVME_REG_PMRMSCU:
6134         if (!NVME_CAP_PMRS(cap)) {
6135             return;
6136         }
6137 
6138         stl_le_p(&n->bar.pmrmscu, data);
6139         return;
6140     default:
6141         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
6142                        "invalid MMIO write,"
6143                        " offset=0x%"PRIx64", data=%"PRIx64"",
6144                        offset, data);
6145         break;
6146     }
6147 }
6148 
6149 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
6150 {
6151     NvmeCtrl *n = (NvmeCtrl *)opaque;
6152     uint8_t *ptr = (uint8_t *)&n->bar;
6153 
6154     trace_pci_nvme_mmio_read(addr, size);
6155 
6156     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
6157         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
6158                        "MMIO read not 32-bit aligned,"
6159                        " offset=0x%"PRIx64"", addr);
6160         /* should RAZ, fall through for now */
6161     } else if (unlikely(size < sizeof(uint32_t))) {
6162         NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6163                        "MMIO read smaller than 32-bits,"
6164                        " offset=0x%"PRIx64"", addr);
6165         /* should RAZ, fall through for now */
6166     }
6167 
6168     if (addr > sizeof(n->bar) - size) {
6169         NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6170                        "MMIO read beyond last register,"
6171                        " offset=0x%"PRIx64", returning 0", addr);
6172 
6173         return 0;
6174     }
6175 
6176     /*
6177      * When PMRWBM bit 1 is set, then a read from PMRSTS should ensure
6178      * that prior writes made it to the persistent media before the read
6179      * completes.
6180      */
6181     if (addr == NVME_REG_PMRSTS &&
6182         (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6183         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6184     }
6185 
6186     return ldn_le_p(ptr + addr, size);
6187 }
6188 
6189 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6190 {
6191     uint32_t qid;
6192 
6193     if (unlikely(addr & ((1 << 2) - 1))) {
6194         NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6195                        "doorbell write not 32-bit aligned,"
6196                        " offset=0x%"PRIx64", ignoring", addr);
6197         return;
6198     }
6199 
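    /*
     * Doorbell layout assumed here (4-byte stride, i.e. CAP.DSTRD is zero):
     * the tail doorbell of SQ y lives at offset 1000h + (8 * y) and the
     * head doorbell of CQ y at 1000h + (8 * y) + 4. For example, a write to
     * offset 1008h targets the SQ 1 tail doorbell and a write to 100Ch the
     * CQ 1 head doorbell, which is what the decoding below computes.
     */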
6200     if (((addr - 0x1000) >> 2) & 1) {
6201         /* Completion queue doorbell write */
6202 
6203         uint16_t new_head = val & 0xffff;
6204         int start_sqs;
6205         NvmeCQueue *cq;
6206 
6207         qid = (addr - (0x1000 + (1 << 2))) >> 3;
6208         if (unlikely(nvme_check_cqid(n, qid))) {
6209             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6210                            "completion queue doorbell write"
6211                            " for nonexistent queue,"
6212                            " sqid=%"PRIu32", ignoring", qid);
6213 
6214             /*
6215              * NVM Express v1.3d, Section 4.1 states: "If host software writes
6216              * an invalid value to the Submission Queue Tail Doorbell or
6217              * Completion Queue Head Doorbell register and an Asynchronous Event
6218              * Request command is outstanding, then an asynchronous event is
6219              * posted to the Admin Completion Queue with a status code of
6220              * Invalid Doorbell Write Value."
6221              *
6222              * Also note that the spec includes the "Invalid Doorbell Register"
6223              * status code, but nowhere does it specify when to use it.
6224              * However, it seems reasonable to use it here in a similar
6225              * fashion.
6226              */
6227             if (n->outstanding_aers) {
6228                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6229                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6230                                    NVME_LOG_ERROR_INFO);
6231             }
6232 
6233             return;
6234         }
6235 
6236         cq = n->cq[qid];
6237         if (unlikely(new_head >= cq->size)) {
6238             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6239                            "completion queue doorbell write value"
6240                            " beyond queue size, sqid=%"PRIu32","
6241                            " new_head=%"PRIu16", ignoring",
6242                            qid, new_head);
6243 
6244             if (n->outstanding_aers) {
6245                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6246                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6247                                    NVME_LOG_ERROR_INFO);
6248             }
6249 
6250             return;
6251         }
6252 
6253         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6254 
6255         start_sqs = nvme_cq_full(cq) ? 1 : 0;
6256         cq->head = new_head;
6257         if (start_sqs) {
6258             NvmeSQueue *sq;
6259             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6260                 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6261             }
6262             timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6263         }
6264 
6265         if (cq->tail == cq->head) {
6266             if (cq->irq_enabled) {
6267                 n->cq_pending--;
6268             }
6269 
6270             nvme_irq_deassert(n, cq);
6271         }
6272     } else {
6273         /* Submission queue doorbell write */
6274 
6275         uint16_t new_tail = val & 0xffff;
6276         NvmeSQueue *sq;
6277 
6278         qid = (addr - 0x1000) >> 3;
6279         if (unlikely(nvme_check_sqid(n, qid))) {
6280             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6281                            "submission queue doorbell write"
6282                            " for nonexistent queue,"
6283                            " sqid=%"PRIu32", ignoring", qid);
6284 
6285             if (n->outstanding_aers) {
6286                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6287                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6288                                    NVME_LOG_ERROR_INFO);
6289             }
6290 
6291             return;
6292         }
6293 
6294         sq = n->sq[qid];
6295         if (unlikely(new_tail >= sq->size)) {
6296             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6297                            "submission queue doorbell write value"
6298                            " beyond queue size, sqid=%"PRIu32","
6299                            " new_tail=%"PRIu16", ignoring",
6300                            qid, new_tail);
6301 
6302             if (n->outstanding_aers) {
6303                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6304                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6305                                    NVME_LOG_ERROR_INFO);
6306             }
6307 
6308             return;
6309         }
6310 
6311         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6312 
6313         sq->tail = new_tail;
6314         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6315     }
6316 }
6317 
6318 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6319                             unsigned size)
6320 {
6321     NvmeCtrl *n = (NvmeCtrl *)opaque;
6322 
6323     trace_pci_nvme_mmio_write(addr, data, size);
6324 
6325     if (addr < sizeof(n->bar)) {
6326         nvme_write_bar(n, addr, data, size);
6327     } else {
6328         nvme_process_db(n, addr, data);
6329     }
6330 }
6331 
6332 static const MemoryRegionOps nvme_mmio_ops = {
6333     .read = nvme_mmio_read,
6334     .write = nvme_mmio_write,
6335     .endianness = DEVICE_LITTLE_ENDIAN,
6336     .impl = {
6337         .min_access_size = 2,
6338         .max_access_size = 8,
6339     },
6340 };
6341 
6342 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6343                            unsigned size)
6344 {
6345     NvmeCtrl *n = (NvmeCtrl *)opaque;
6346     stn_le_p(&n->cmb.buf[addr], size, data);
6347 }
6348 
6349 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6350 {
6351     NvmeCtrl *n = (NvmeCtrl *)opaque;
6352     return ldn_le_p(&n->cmb.buf[addr], size);
6353 }
6354 
6355 static const MemoryRegionOps nvme_cmb_ops = {
6356     .read = nvme_cmb_read,
6357     .write = nvme_cmb_write,
6358     .endianness = DEVICE_LITTLE_ENDIAN,
6359     .impl = {
6360         .min_access_size = 1,
6361         .max_access_size = 8,
6362     },
6363 };
6364 
6365 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6366 {
6367     NvmeParams *params = &n->params;
6368 
6369     if (params->num_queues) {
6370         warn_report("num_queues is deprecated; please use max_ioqpairs "
6371                     "instead");
6372 
6373         params->max_ioqpairs = params->num_queues - 1;
6374     }
6375 
6376     if (n->namespace.blkconf.blk && n->subsys) {
6377         error_setg(errp, "subsystem support is unavailable with legacy "
6378                    "namespace ('drive' property)");
6379         return;
6380     }
6381 
6382     if (params->max_ioqpairs < 1 ||
6383         params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6384         error_setg(errp, "max_ioqpairs must be between 1 and %d",
6385                    NVME_MAX_IOQPAIRS);
6386         return;
6387     }
6388 
6389     if (params->msix_qsize < 1 ||
6390         params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6391         error_setg(errp, "msix_qsize must be between 1 and %d",
6392                    PCI_MSIX_FLAGS_QSIZE + 1);
6393         return;
6394     }
6395 
6396     if (!params->serial) {
6397         error_setg(errp, "serial property not set");
6398         return;
6399     }
6400 
6401     if (n->pmr.dev) {
6402         if (host_memory_backend_is_mapped(n->pmr.dev)) {
6403             error_setg(errp, "can't use already busy memdev: %s",
6404                        object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6405             return;
6406         }
6407 
6408         if (!is_power_of_2(n->pmr.dev->size)) {
6409             error_setg(errp, "pmr backend size needs to be power of 2 in size");
6410             return;
6411         }
6412 
6413         host_memory_backend_set_mapped(n->pmr.dev, true);
6414     }
6415 
6416     if (n->params.zasl > n->params.mdts) {
6417         error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6418                    "than or equal to mdts (Maximum Data Transfer Size)");
6419         return;
6420     }
6421 
6422     if (!n->params.vsl) {
6423         error_setg(errp, "vsl must be non-zero");
6424         return;
6425     }
6426 }
6427 
6428 static void nvme_init_state(NvmeCtrl *n)
6429 {
6430     /* add one to max_ioqpairs to account for the admin queue pair */
6431     n->reg_size = pow2ceil(sizeof(NvmeBar) +
6432                            2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
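    /*
     * Illustrative sizing with the default max_ioqpairs of 64 and a 4-byte
     * doorbell stride (values assumed here): 65 queue pairs need
     * 2 * 65 * NVME_DB_SIZE = 520 bytes of doorbell registers on top of the
     * register block, and pow2ceil() rounds the total up to the next power
     * of two.
     */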
6433     n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6434     n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6435     n->temperature = NVME_TEMPERATURE;
6436     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6437     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6438     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6439 }
6440 
6441 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6442 {
6443     uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6444     uint64_t cap = ldq_le_p(&n->bar.cap);
6445 
6446     n->cmb.buf = g_malloc0(cmb_size);
6447     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6448                           "nvme-cmb", cmb_size);
6449     pci_register_bar(pci_dev, NVME_CMB_BIR,
6450                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6451                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6452                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6453 
6454     NVME_CAP_SET_CMBS(cap, 1);
6455     stq_le_p(&n->bar.cap, cap);
6456 
6457     if (n->params.legacy_cmb) {
6458         nvme_cmb_enable_regs(n);
6459         n->cmb.cmse = true;
6460     }
6461 }
6462 
6463 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6464 {
6465     uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6466 
6467     NVME_PMRCAP_SET_RDS(pmrcap, 1);
6468     NVME_PMRCAP_SET_WDS(pmrcap, 1);
6469     NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6470     /* Turn on PMRWBM bit 1 support */
6471     NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6472     NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6473     stl_le_p(&n->bar.pmrcap, pmrcap);
6474 
6475     pci_register_bar(pci_dev, NVME_PMR_BIR,
6476                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6477                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6478                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6479 
6480     memory_region_set_enabled(&n->pmr.dev->mr, false);
6481 }
6482 
6483 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6484 {
6485     uint8_t *pci_conf = pci_dev->config;
6486     uint64_t bar_size, msix_table_size, msix_pba_size;
6487     unsigned msix_table_offset, msix_pba_offset;
6488     int ret;
6489 
6490     Error *err = NULL;
6491 
6492     pci_conf[PCI_INTERRUPT_PIN] = 1;
6493     pci_config_set_prog_interface(pci_conf, 0x2);
6494 
6495     if (n->params.use_intel_id) {
6496         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6497         pci_config_set_device_id(pci_conf, 0x5845);
6498     } else {
6499         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6500         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6501     }
6502 
6503     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6504     pcie_endpoint_cap_init(pci_dev, 0x80);
6505 
6506     bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6507     msix_table_offset = bar_size;
6508     msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6509 
6510     bar_size += msix_table_size;
6511     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6512     msix_pba_offset = bar_size;
6513     msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6514 
6515     bar_size += msix_pba_size;
6516     bar_size = pow2ceil(bar_size);
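    /*
     * Illustrative layout with the default msix_qsize of 65 (values assumed
     * here): the MSI-X table takes 65 * PCI_MSIX_ENTRY_SIZE = 1040 bytes
     * and the PBA QEMU_ALIGN_UP(65, 64) / 8 = 16 bytes; each is placed at
     * the next 4 KiB boundary, and the final BAR size is rounded up to a
     * power of two.
     */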
6517 
6518     memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6519     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6520                           n->reg_size);
6521     memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6522 
6523     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6524                      PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6525     ret = msix_init(pci_dev, n->params.msix_qsize,
6526                     &n->bar0, 0, msix_table_offset,
6527                     &n->bar0, 0, msix_pba_offset, 0, &err);
6528     if (ret < 0) {
6529         if (ret == -ENOTSUP) {
6530             warn_report_err(err);
6531         } else {
6532             error_propagate(errp, err);
6533             return ret;
6534         }
6535     }
6536 
6537     if (n->params.cmb_size_mb) {
6538         nvme_init_cmb(n, pci_dev);
6539     }
6540 
6541     if (n->pmr.dev) {
6542         nvme_init_pmr(n, pci_dev);
6543     }
6544 
6545     return 0;
6546 }
6547 
6548 static void nvme_init_subnqn(NvmeCtrl *n)
6549 {
6550     NvmeSubsystem *subsys = n->subsys;
6551     NvmeIdCtrl *id = &n->id_ctrl;
6552 
6553     if (!subsys) {
6554         snprintf((char *)id->subnqn, sizeof(id->subnqn),
6555                  "nqn.2019-08.org.qemu:%s", n->params.serial);
6556     } else {
6557         pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6558     }
6559 }
6560 
6561 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6562 {
6563     NvmeIdCtrl *id = &n->id_ctrl;
6564     uint8_t *pci_conf = pci_dev->config;
6565     uint64_t cap = ldq_le_p(&n->bar.cap);
6566 
6567     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6568     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6569     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6570     strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6571     strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6572 
6573     id->cntlid = cpu_to_le16(n->cntlid);
6574 
6575     id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6576 
6577     id->rab = 6;
6578 
6579     if (n->params.use_intel_id) {
6580         id->ieee[0] = 0xb3;
6581         id->ieee[1] = 0x02;
6582         id->ieee[2] = 0x00;
6583     } else {
6584         id->ieee[0] = 0x00;
6585         id->ieee[1] = 0x54;
6586         id->ieee[2] = 0x52;
6587     }
6588 
6589     id->mdts = n->params.mdts;
6590     id->ver = cpu_to_le32(NVME_SPEC_VER);
6591     id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6592     id->cntrltype = 0x1;
6593 
6594     /*
6595      * Because the controller always completes the Abort command immediately,
6596      * there can never be more than one concurrently executing Abort command,
6597      * so this value is never used for anything. Note that there can easily be
6598      * many Abort commands in the queues, but they are not considered
6599      * "executing" until processed by nvme_abort.
6600      *
6601      * The specification recommends a value of 3 for Abort Command Limit (four
6602      * concurrently outstanding Abort commands), so let's use that, though it is
6603      * inconsequential.
6604      */
6605     id->acl = 3;
6606     id->aerl = n->params.aerl;
6607     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6608     id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6609 
6610     /* recommended default value (~70 C) */
6611     id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6612     id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6613 
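    /*
     * SQES/CQES encode the maximum entry size in bits 7:4 and the required
     * (minimum) entry size in bits 3:0, each as a power of two: 64-byte
     * submission queue entries (2^6) and 16-byte completion queue entries
     * (2^4).
     */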
6614     id->sqes = (0x6 << 4) | 0x6;
6615     id->cqes = (0x4 << 4) | 0x4;
6616     id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6617     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6618                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6619                            NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6620 
6621     /*
6622      * NOTE: If this device ever supports a command set that does NOT use 0x0
6623      * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6624      * should probably be removed.
6625      *
6626      * See comment in nvme_io_cmd.
6627      */
6628     id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6629 
6630     id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6631     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6632                            NVME_CTRL_SGLS_BITBUCKET);
6633 
6634     nvme_init_subnqn(n);
6635 
6636     id->psd[0].mp = cpu_to_le16(0x9c4);
6637     id->psd[0].enlat = cpu_to_le32(0x10);
6638     id->psd[0].exlat = cpu_to_le32(0x4);
6639 
6640     if (n->subsys) {
6641         id->cmic |= NVME_CMIC_MULTI_CTRL;
6642     }
6643 
6644     NVME_CAP_SET_MQES(cap, 0x7ff);
6645     NVME_CAP_SET_CQR(cap, 1);
6646     NVME_CAP_SET_TO(cap, 0xf);
6647     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
6648     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
6649     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
6650     NVME_CAP_SET_MPSMAX(cap, 4);
6651     NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
6652     NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
6653     stq_le_p(&n->bar.cap, cap);
6654 
6655     stl_le_p(&n->bar.vs, NVME_SPEC_VER);
6656     n->bar.intmc = n->bar.intms = 0;
6657 }
6658 
6659 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6660 {
6661     int cntlid;
6662 
6663     if (!n->subsys) {
6664         return 0;
6665     }
6666 
6667     cntlid = nvme_subsys_register_ctrl(n, errp);
6668     if (cntlid < 0) {
6669         return -1;
6670     }
6671 
6672     n->cntlid = cntlid;
6673 
6674     return 0;
6675 }
6676 
6677 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6678 {
6679     uint32_t nsid = ns->params.nsid;
6680     assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6681 
6682     n->namespaces[nsid] = ns;
6683     ns->attached++;
6684 
6685     n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6686                             BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6687 }
6688 
6689 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6690 {
6691     NvmeCtrl *n = NVME(pci_dev);
6692     NvmeNamespace *ns;
6693     Error *local_err = NULL;
6694 
6695     nvme_check_constraints(n, &local_err);
6696     if (local_err) {
6697         error_propagate(errp, local_err);
6698         return;
6699     }
6700 
6701     qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6702               &pci_dev->qdev, n->parent_obj.qdev.id);
6703 
6704     nvme_init_state(n);
6705     if (nvme_init_pci(n, pci_dev, errp)) {
6706         return;
6707     }
6708 
6709     if (nvme_init_subsys(n, errp)) {
6710         error_propagate(errp, local_err);
6711         return;
6712     }
6713     nvme_init_ctrl(n, pci_dev);
6714 
6715     /* setup a namespace if the controller drive property was given */
6716     if (n->namespace.blkconf.blk) {
6717         ns = &n->namespace;
6718         ns->params.nsid = 1;
6719 
6720         if (nvme_ns_setup(ns, errp)) {
6721             return;
6722         }
6723 
6724         nvme_attach_ns(n, ns);
6725     }
6726 }
6727 
6728 static void nvme_exit(PCIDevice *pci_dev)
6729 {
6730     NvmeCtrl *n = NVME(pci_dev);
6731     NvmeNamespace *ns;
6732     int i;
6733 
6734     nvme_ctrl_reset(n);
6735 
6736     if (n->subsys) {
6737         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6738             ns = nvme_ns(n, i);
6739             if (ns) {
6740                 ns->attached--;
6741             }
6742         }
6743 
6744         nvme_subsys_unregister_ctrl(n->subsys, n);
6745     }
6746 
6747     g_free(n->cq);
6748     g_free(n->sq);
6749     g_free(n->aer_reqs);
6750 
6751     if (n->params.cmb_size_mb) {
6752         g_free(n->cmb.buf);
6753     }
6754 
6755     if (n->pmr.dev) {
6756         host_memory_backend_set_mapped(n->pmr.dev, false);
6757     }
6758     msix_uninit(pci_dev, &n->bar0, &n->bar0);
6759     memory_region_del_subregion(&n->bar0, &n->iomem);
6760 }
6761 
6762 static Property nvme_props[] = {
6763     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6764     DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6765                      HostMemoryBackend *),
6766     DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6767                      NvmeSubsystem *),
6768     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6769     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6770     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6771     DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6772     DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6773     DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6774     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6775     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6776     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6777     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6778     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6779     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6780     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6781                      params.auto_transition_zones, true),
6782     DEFINE_PROP_END_OF_LIST(),
6783 };
6784 
6785 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6786                                    void *opaque, Error **errp)
6787 {
6788     NvmeCtrl *n = NVME(obj);
6789     uint8_t value = n->smart_critical_warning;
6790 
6791     visit_type_uint8(v, name, &value, errp);
6792 }
6793 
6794 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6795                                    void *opaque, Error **errp)
6796 {
6797     NvmeCtrl *n = NVME(obj);
6798     uint8_t value, old_value, cap = 0, index, event;
6799 
6800     if (!visit_type_uint8(v, name, &value, errp)) {
6801         return;
6802     }
6803 
6804     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6805           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6806     if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6807         cap |= NVME_SMART_PMR_UNRELIABLE;
6808     }
6809 
6810     if ((value & cap) != value) {
6811         error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6812                    value & ~cap);
6813         return;
6814     }
6815 
6816     old_value = n->smart_critical_warning;
6817     n->smart_critical_warning = value;
6818 
6819     /* only inject new bits of smart critical warning */
6820     for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6821         event = 1 << index;
6822         if (value & ~old_value & event) {
6823             nvme_smart_event(n, event);
        }
6824     }
6825 }
6826 
6827 static const VMStateDescription nvme_vmstate = {
6828     .name = "nvme",
6829     .unmigratable = 1,
6830 };
6831 
6832 static void nvme_class_init(ObjectClass *oc, void *data)
6833 {
6834     DeviceClass *dc = DEVICE_CLASS(oc);
6835     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6836 
6837     pc->realize = nvme_realize;
6838     pc->exit = nvme_exit;
6839     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6840     pc->revision = 2;
6841 
6842     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6843     dc->desc = "Non-Volatile Memory Express";
6844     device_class_set_props(dc, nvme_props);
6845     dc->vmsd = &nvme_vmstate;
6846 }
6847 
6848 static void nvme_instance_init(Object *obj)
6849 {
6850     NvmeCtrl *n = NVME(obj);
6851 
6852     device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6853                                   "bootindex", "/namespace@1,0",
6854                                   DEVICE(obj));
6855 
6856     object_property_add(obj, "smart_critical_warning", "uint8",
6857                         nvme_get_smart_warning,
6858                         nvme_set_smart_warning, NULL, NULL);
6859 }
6860 
6861 static const TypeInfo nvme_info = {
6862     .name          = TYPE_NVME,
6863     .parent        = TYPE_PCI_DEVICE,
6864     .instance_size = sizeof(NvmeCtrl),
6865     .instance_init = nvme_instance_init,
6866     .class_init    = nvme_class_init,
6867     .interfaces = (InterfaceInfo[]) {
6868         { INTERFACE_PCIE_DEVICE },
6869         { }
6870     },
6871 };
6872 
6873 static const TypeInfo nvme_bus_info = {
6874     .name = TYPE_NVME_BUS,
6875     .parent = TYPE_BUS,
6876     .instance_size = sizeof(NvmeBus),
6877 };
6878 
6879 static void nvme_register_types(void)
6880 {
6881     type_register_static(&nvme_info);
6882     type_register_static(&nvme_bus_info);
6883 }
6884 
6885 type_init(nvme_register_types)
6886