xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 2f44bea9)
1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <keith.busch@intel.com>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10 
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13  *
14  *  https://nvmexpress.org/developers/nvme-specification/
15  *
16  *
17  * Notes on coding style
18  * ---------------------
19  * While QEMU coding style prefers lowercase hexadecimals in constants, the
 20  * NVMe subsystem uses the format from the NVMe specifications in the comments
21  * (i.e. 'h' suffix instead of '0x' prefix).
22  *
23  * Usage
24  * -----
25  * See docs/system/nvme.rst for extensive documentation.
26  *
27  * Add options:
28  *      -drive file=<file>,if=none,id=<drive_id>
29  *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30  *      -device nvme,serial=<serial>,id=<bus_name>, \
31  *              cmb_size_mb=<cmb_size_mb[optional]>, \
32  *              [pmrdev=<mem_backend_file_id>,] \
33  *              max_ioqpairs=<N[optional]>, \
34  *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35  *              mdts=<N[optional]>,vsl=<N[optional]>, \
36  *              zoned.zasl=<N[optional]>, \
37  *              zoned.auto_transition=<on|off[optional]>, \
38  *              subsys=<subsys_id>
39  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
40  *              zoned=<true|false[optional]>, \
41  *              subsys=<subsys_id>,detached=<true|false[optional]>
42  *
 43  * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed to
 44  * be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
45  * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
46  * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
47  *
 48  * PMR emulation can be enabled by pointing pmrdev to a memory-backend-file.
49  * For example:
50  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
51  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
52  *
53  * The PMR will use BAR 4/5 exclusively.
54  *
 55  * To place controller(s) and namespace(s) in a subsystem, provide the
 56  * nvme-subsys device as shown above.
57  *
58  * nvme subsystem device parameters
59  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60  * - `nqn`
61  *   This parameter provides the `<nqn_id>` part of the string
62  *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
63  *   of subsystem controllers. Note that `<nqn_id>` should be unique per
64  *   subsystem, but this is not enforced by QEMU. If not specified, it will
65  *   default to the value of the `id` parameter (`<subsys_id>`).
66  *
67  * nvme device parameters
68  * ~~~~~~~~~~~~~~~~~~~~~~
69  * - `subsys`
70  *   Specifying this parameter attaches the controller to the subsystem and
71  *   the SUBNQN field in the controller will report the NQN of the subsystem
 72  *   device. This also enables the multi-controller capability, represented in
 73  *   the Identify Controller data structure by the CMIC field (Controller
 74  *   Multi-path I/O and Namespace Sharing Capabilities).
75  *
76  * - `aerl`
77  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 78  *   of concurrently outstanding Asynchronous Event Request commands supported
79  *   by the controller. This is a 0's based value.
80  *
81  * - `aer_max_queued`
82  *   This is the maximum number of events that the device will enqueue for
83  *   completion when there are no outstanding AERs. When the maximum number of
84  *   enqueued events are reached, subsequent events will be dropped.
85  *
86  * - `mdts`
87  *   Indicates the maximum data transfer size for a command that transfers data
88  *   between host-accessible memory and the controller. The value is specified
89  *   as a power of two (2^n) and is in units of the minimum memory page size
90  *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
91  *
92  * - `vsl`
93  *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
94  *   this value is specified as a power of two (2^n) and is in units of the
95  *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
96  *   KiB).
97  *
98  * - `zoned.zasl`
99  *   Indicates the maximum data transfer size for the Zone Append command. Like
100  *   `mdts`, the value is specified as a power of two (2^n) and is in units of
101  *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
102  *   defaulting to the value of `mdts`).
103  *
104  * - `zoned.auto_transition`
 105  *   Indicates whether zones in the Implicitly Opened state may be automatically
 106  *   transitioned to the Closed state for resource management purposes.
107  *   Defaults to 'on'.
108  *
109  * nvme namespace device parameters
110  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111  * - `shared`
112  *   When the parent nvme device (as defined explicitly by the 'bus' parameter
113  *   or implicitly by the most recently defined NvmeBus) is linked to an
114  *   nvme-subsys device, the namespace will be attached to all controllers in
115  *   the subsystem. If set to 'off' (the default), the namespace will remain a
116  *   private namespace and may only be attached to a single controller at a
117  *   time.
118  *
119  * - `detached`
120  *   This parameter is only valid together with the `subsys` parameter. If left
121  *   at the default value (`false/off`), the namespace will be attached to all
122  *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
123  *   namespace will be available in the subsystem but not attached to any
124  *   controllers.
125  *
 126  * Setting `zoned` to true selects the Zoned Command Set for the namespace.
127  * In this case, the following namespace properties are available to configure
128  * zoned operation:
129  *     zoned.zone_size=<zone size in bytes, default: 128MiB>
130  *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
131  *
132  *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
133  *         The value 0 (default) forces zone capacity to be the same as zone
134  *         size. The value of this property may not exceed zone size.
135  *
136  *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
137  *         This value needs to be specified in 64B units. If it is zero,
138  *         namespace(s) will not support zone descriptor extensions.
139  *
140  *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
141  *         The default value means there is no limit to the number of
142  *         concurrently active zones.
143  *
144  *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
145  *         The default value means there is no limit to the number of
146  *         concurrently open zones.
147  *
148  *     zoned.cross_read=<enable RAZB, default: false>
149  *         Setting this property to true enables Read Across Zone Boundaries.
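 *
 * An illustrative example combining the options above (device IDs and sizes
 * are arbitrary):
 *     -drive file=nvme.img,if=none,id=nvm-1
 *     -device nvme-subsys,id=subsys0,nqn=subsys0
 *     -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *     -device nvme-ns,drive=nvm-1,bus=nvme0,nsid=1,zoned=true, \
 *             zoned.zone_size=64M,zoned.max_open=16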
150  */
151 
152 #include "qemu/osdep.h"
153 #include "qemu/cutils.h"
154 #include "qemu/error-report.h"
155 #include "qemu/log.h"
156 #include "qemu/units.h"
157 #include "qapi/error.h"
158 #include "qapi/visitor.h"
159 #include "sysemu/sysemu.h"
160 #include "sysemu/block-backend.h"
161 #include "sysemu/hostmem.h"
162 #include "hw/pci/msix.h"
163 #include "migration/vmstate.h"
164 
165 #include "nvme.h"
166 #include "trace.h"
167 
168 #define NVME_MAX_IOQPAIRS 0xffff
169 #define NVME_DB_SIZE  4
170 #define NVME_SPEC_VER 0x00010400
171 #define NVME_CMB_BIR 2
172 #define NVME_PMR_BIR 4
173 #define NVME_TEMPERATURE 0x143
174 #define NVME_TEMPERATURE_WARNING 0x157
175 #define NVME_TEMPERATURE_CRITICAL 0x175
176 #define NVME_NUM_FW_SLOTS 1
177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
178 
179 #define NVME_GUEST_ERR(trace, fmt, ...) \
180     do { \
181         (trace_##trace)(__VA_ARGS__); \
182         qemu_log_mask(LOG_GUEST_ERROR, #trace \
183             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
184     } while (0)
185 
186 static const bool nvme_feature_support[NVME_FID_MAX] = {
187     [NVME_ARBITRATION]              = true,
188     [NVME_POWER_MANAGEMENT]         = true,
189     [NVME_TEMPERATURE_THRESHOLD]    = true,
190     [NVME_ERROR_RECOVERY]           = true,
191     [NVME_VOLATILE_WRITE_CACHE]     = true,
192     [NVME_NUMBER_OF_QUEUES]         = true,
193     [NVME_INTERRUPT_COALESCING]     = true,
194     [NVME_INTERRUPT_VECTOR_CONF]    = true,
195     [NVME_WRITE_ATOMICITY]          = true,
196     [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
197     [NVME_TIMESTAMP]                = true,
198     [NVME_COMMAND_SET_PROFILE]      = true,
199 };
200 
201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
202     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
203     [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
204     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
205     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
206     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
207     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
208     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
209 };
210 
211 static const uint32_t nvme_cse_acs[256] = {
212     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
213     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
214     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
215     [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
216     [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
217     [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
218     [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
219     [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
220     [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
221     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
222     [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
223     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
224 };
225 
226 static const uint32_t nvme_cse_iocs_none[256];
227 
228 static const uint32_t nvme_cse_iocs_nvm[256] = {
229     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
231     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
233     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
235     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
236     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
237 };
238 
239 static const uint32_t nvme_cse_iocs_zoned[256] = {
240     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
241     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
242     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
243     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
244     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
245     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
246     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
247     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
248     [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
249     [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
250     [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
251 };
252 
253 static void nvme_process_sq(void *opaque);
254 
255 static uint16_t nvme_sqid(NvmeRequest *req)
256 {
257     return le16_to_cpu(req->sq->sqid);
258 }
259 
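/*
 * Transition a zone to a new state: remove it from the list tracking its
 * current state (if any), update the Zone State field and insert it on the
 * list tracking the new state (explicitly/implicitly open, closed or full).
 */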
260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261                                    NvmeZoneState state)
262 {
263     if (QTAILQ_IN_USE(zone, entry)) {
264         switch (nvme_get_zone_state(zone)) {
265         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266             QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267             break;
268         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270             break;
271         case NVME_ZONE_STATE_CLOSED:
272             QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273             break;
274         case NVME_ZONE_STATE_FULL:
275             QTAILQ_REMOVE(&ns->full_zones, zone, entry);
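            /* fall through */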
276         default:
277             ;
278         }
279     }
280 
281     nvme_set_zone_state(zone, state);
282 
283     switch (state) {
284     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285         QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286         break;
287     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288         QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289         break;
290     case NVME_ZONE_STATE_CLOSED:
291         QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292         break;
293     case NVME_ZONE_STATE_FULL:
294         QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
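        /* fall through */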
295     case NVME_ZONE_STATE_READ_ONLY:
296         break;
297     default:
298         zone->d.za = 0;
299     }
300 }
301 
302 /*
303  * Check if we can open a zone without exceeding open/active limits.
304  * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
305  */
306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
307 {
308     if (ns->params.max_active_zones != 0 &&
309         ns->nr_active_zones + act > ns->params.max_active_zones) {
310         trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
311         return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
312     }
313     if (ns->params.max_open_zones != 0 &&
314         ns->nr_open_zones + opn > ns->params.max_open_zones) {
315         trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
316         return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
317     }
318 
319     return NVME_SUCCESS;
320 }
321 
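/*
 * Return true if 'addr' falls within the Controller Memory Buffer. The CMB
 * base is the BAR address when the legacy (v1.3) CMB scheme is in use and the
 * CBA programmed through CMBMSC otherwise.
 */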
322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
323 {
324     hwaddr hi, lo;
325 
326     if (!n->cmb.cmse) {
327         return false;
328     }
329 
330     lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
331     hi = lo + int128_get64(n->cmb.mem.size);
332 
333     return addr >= lo && addr < hi;
334 }
335 
336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
337 {
338     hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
339     return &n->cmb.buf[addr - base];
340 }
341 
342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
343 {
344     hwaddr hi;
345 
346     if (!n->pmr.cmse) {
347         return false;
348     }
349 
350     hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
351 
352     return addr >= n->pmr.cba && addr < hi;
353 }
354 
355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
356 {
357     return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
358 }
359 
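/*
 * nvme_addr_read() and nvme_addr_write() copy to/from a guest address. Ranges
 * that lie entirely within the CMB or PMR are serviced with a direct memcpy;
 * anything else goes through PCI DMA. A non-zero return indicates failure
 * (including address wrap-around).
 */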
360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
361 {
362     hwaddr hi = addr + size - 1;
363     if (hi < addr) {
364         return 1;
365     }
366 
367     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
368         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
369         return 0;
370     }
371 
372     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
373         memcpy(buf, nvme_addr_to_pmr(n, addr), size);
374         return 0;
375     }
376 
377     return pci_dma_read(&n->parent_obj, addr, buf, size);
378 }
379 
380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
381 {
382     hwaddr hi = addr + size - 1;
383     if (hi < addr) {
384         return 1;
385     }
386 
387     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
388         memcpy(nvme_addr_to_cmb(n, addr), buf, size);
389         return 0;
390     }
391 
392     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
393         memcpy(nvme_addr_to_pmr(n, addr), buf, size);
394         return 0;
395     }
396 
397     return pci_dma_write(&n->parent_obj, addr, buf, size);
398 }
399 
400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
401 {
402     return nsid &&
403         (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
404 }
405 
406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
407 {
408     return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
409 }
410 
411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
412 {
413     return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
414 }
415 
416 static void nvme_inc_cq_tail(NvmeCQueue *cq)
417 {
418     cq->tail++;
419     if (cq->tail >= cq->size) {
420         cq->tail = 0;
421         cq->phase = !cq->phase;
422     }
423 }
424 
425 static void nvme_inc_sq_head(NvmeSQueue *sq)
426 {
427     sq->head = (sq->head + 1) % sq->size;
428 }
429 
430 static uint8_t nvme_cq_full(NvmeCQueue *cq)
431 {
432     return (cq->tail + 1) % cq->size == cq->head;
433 }
434 
435 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
436 {
437     return sq->head == sq->tail;
438 }
439 
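/*
 * Update the legacy (pin-based) interrupt line. With MSI-X enabled the pin is
 * never touched; otherwise it is asserted while any interrupt not masked by
 * INTMS is pending and deasserted otherwise.
 */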
440 static void nvme_irq_check(NvmeCtrl *n)
441 {
442     uint32_t intms = ldl_le_p(&n->bar.intms);
443 
444     if (msix_enabled(&(n->parent_obj))) {
445         return;
446     }
447     if (~intms & n->irq_status) {
448         pci_irq_assert(&n->parent_obj);
449     } else {
450         pci_irq_deassert(&n->parent_obj);
451     }
452 }
453 
454 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
455 {
456     if (cq->irq_enabled) {
457         if (msix_enabled(&(n->parent_obj))) {
458             trace_pci_nvme_irq_msix(cq->vector);
459             msix_notify(&(n->parent_obj), cq->vector);
460         } else {
461             trace_pci_nvme_irq_pin();
462             assert(cq->vector < 32);
463             n->irq_status |= 1 << cq->vector;
464             nvme_irq_check(n);
465         }
466     } else {
467         trace_pci_nvme_irq_masked();
468     }
469 }
470 
471 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
472 {
473     if (cq->irq_enabled) {
474         if (msix_enabled(&(n->parent_obj))) {
475             return;
476         } else {
477             assert(cq->vector < 32);
478             if (!n->cq_pending) {
479                 n->irq_status &= ~(1 << cq->vector);
480             }
481             nvme_irq_check(n);
482         }
483     }
484 }
485 
486 static void nvme_req_clear(NvmeRequest *req)
487 {
488     req->ns = NULL;
489     req->opaque = NULL;
490     req->aiocb = NULL;
491     memset(&req->cqe, 0x0, sizeof(req->cqe));
492     req->status = NVME_SUCCESS;
493 }
494 
495 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
496 {
497     if (dma) {
498         pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
499         sg->flags = NVME_SG_DMA;
500     } else {
501         qemu_iovec_init(&sg->iov, 0);
502     }
503 
504     sg->flags |= NVME_SG_ALLOC;
505 }
506 
507 static inline void nvme_sg_unmap(NvmeSg *sg)
508 {
509     if (!(sg->flags & NVME_SG_ALLOC)) {
510         return;
511     }
512 
513     if (sg->flags & NVME_SG_DMA) {
514         qemu_sglist_destroy(&sg->qsg);
515     } else {
516         qemu_iovec_destroy(&sg->iov);
517     }
518 
519     memset(sg, 0x0, sizeof(*sg));
520 }
521 
522 /*
 523  * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
524  * holds both data and metadata. This function splits the data and metadata
525  * into two separate QSG/IOVs.
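 *
 * For example, with a 512+8 extended LBA format the mapped buffer alternates
 * 512 bytes of data and 8 bytes of metadata; the data ranges are appended to
 * `data` and the metadata ranges to `mdata` (either may be NULL to discard
 * that part of the buffer).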
526  */
527 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
528                           NvmeSg *mdata)
529 {
530     NvmeSg *dst = data;
531     uint32_t trans_len, count = ns->lbasz;
532     uint64_t offset = 0;
533     bool dma = sg->flags & NVME_SG_DMA;
534     size_t sge_len;
535     size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
536     int sg_idx = 0;
537 
538     assert(sg->flags & NVME_SG_ALLOC);
539 
540     while (sg_len) {
541         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
542 
543         trans_len = MIN(sg_len, count);
544         trans_len = MIN(trans_len, sge_len - offset);
545 
546         if (dst) {
547             if (dma) {
548                 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
549                                 trans_len);
550             } else {
551                 qemu_iovec_add(&dst->iov,
552                                sg->iov.iov[sg_idx].iov_base + offset,
553                                trans_len);
554             }
555         }
556 
557         sg_len -= trans_len;
558         count -= trans_len;
559         offset += trans_len;
560 
561         if (count == 0) {
562             dst = (dst == data) ? mdata : data;
563             count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
564         }
565 
566         if (sge_len == offset) {
567             offset = 0;
568             sg_idx++;
569         }
570     }
571 }
572 
573 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
574                                   size_t len)
575 {
576     if (!len) {
577         return NVME_SUCCESS;
578     }
579 
580     trace_pci_nvme_map_addr_cmb(addr, len);
581 
582     if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
583         return NVME_DATA_TRAS_ERROR;
584     }
585 
586     qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
587 
588     return NVME_SUCCESS;
589 }
590 
591 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
592                                   size_t len)
593 {
594     if (!len) {
595         return NVME_SUCCESS;
596     }
597 
598     if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
599         return NVME_DATA_TRAS_ERROR;
600     }
601 
602     qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
603 
604     return NVME_SUCCESS;
605 }
606 
607 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
608 {
609     bool cmb = false, pmr = false;
610 
611     if (!len) {
612         return NVME_SUCCESS;
613     }
614 
615     trace_pci_nvme_map_addr(addr, len);
616 
617     if (nvme_addr_is_cmb(n, addr)) {
618         cmb = true;
619     } else if (nvme_addr_is_pmr(n, addr)) {
620         pmr = true;
621     }
622 
623     if (cmb || pmr) {
624         if (sg->flags & NVME_SG_DMA) {
625             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
626         }
627 
628         if (sg->iov.niov + 1 > IOV_MAX) {
629             goto max_mappings_exceeded;
630         }
631 
632         if (cmb) {
633             return nvme_map_addr_cmb(n, &sg->iov, addr, len);
634         } else {
635             return nvme_map_addr_pmr(n, &sg->iov, addr, len);
636         }
637     }
638 
639     if (!(sg->flags & NVME_SG_DMA)) {
640         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
641     }
642 
643     if (sg->qsg.nsg + 1 > IOV_MAX) {
644         goto max_mappings_exceeded;
645     }
646 
647     qemu_sglist_add(&sg->qsg, addr, len);
648 
649     return NVME_SUCCESS;
650 
651 max_mappings_exceeded:
652     NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
653                    "number of mappings exceeds 1024");
654     return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
655 }
656 
657 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
658 {
659     return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
660 }
661 
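/*
 * Map a PRP1/PRP2 pair into 'sg'. PRP1 covers the first (possibly unaligned)
 * page of the transfer; if more data remains, PRP2 is either the address of a
 * second data page (transfers of up to two pages) or of a PRP list.
 */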
662 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
663                              uint64_t prp2, uint32_t len)
664 {
665     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
666     trans_len = MIN(len, trans_len);
667     int num_prps = (len >> n->page_bits) + 1;
668     uint16_t status;
669     int ret;
670 
671     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
672 
673     nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
674 
675     status = nvme_map_addr(n, sg, prp1, trans_len);
676     if (status) {
677         goto unmap;
678     }
679 
680     len -= trans_len;
681     if (len) {
682         if (len > n->page_size) {
683             uint64_t prp_list[n->max_prp_ents];
684             uint32_t nents, prp_trans;
685             int i = 0;
686 
687             /*
688              * The first PRP list entry, pointed to by PRP2, may itself carry an
689              * offset into the page, so the number of entries that fit in the
690              * remainder of that page must be calculated from this offset.
691              */
692             nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
693             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
694             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
695             if (ret) {
696                 trace_pci_nvme_err_addr_read(prp2);
697                 status = NVME_DATA_TRAS_ERROR;
698                 goto unmap;
699             }
700             while (len != 0) {
701                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
702 
703                 if (i == nents - 1 && len > n->page_size) {
704                     if (unlikely(prp_ent & (n->page_size - 1))) {
705                         trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
706                         status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
707                         goto unmap;
708                     }
709 
710                     i = 0;
711                     nents = (len + n->page_size - 1) >> n->page_bits;
712                     nents = MIN(nents, n->max_prp_ents);
713                     prp_trans = nents * sizeof(uint64_t);
714                     ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
715                                          prp_trans);
716                     if (ret) {
717                         trace_pci_nvme_err_addr_read(prp_ent);
718                         status = NVME_DATA_TRAS_ERROR;
719                         goto unmap;
720                     }
721                     prp_ent = le64_to_cpu(prp_list[i]);
722                 }
723 
724                 if (unlikely(prp_ent & (n->page_size - 1))) {
725                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
726                     status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
727                     goto unmap;
728                 }
729 
730                 trans_len = MIN(len, n->page_size);
731                 status = nvme_map_addr(n, sg, prp_ent, trans_len);
732                 if (status) {
733                     goto unmap;
734                 }
735 
736                 len -= trans_len;
737                 i++;
738             }
739         } else {
740             if (unlikely(prp2 & (n->page_size - 1))) {
741                 trace_pci_nvme_err_invalid_prp2_align(prp2);
742                 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
743                 goto unmap;
744             }
745             status = nvme_map_addr(n, sg, prp2, len);
746             if (status) {
747                 goto unmap;
748             }
749         }
750     }
751 
752     return NVME_SUCCESS;
753 
754 unmap:
755     nvme_sg_unmap(sg);
756     return status;
757 }
758 
759 /*
760  * Map 'nsgld' data descriptors from 'segment'. The number of bytes mapped
761  * is subtracted from *len.
762  */
763 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
764                                   NvmeSglDescriptor *segment, uint64_t nsgld,
765                                   size_t *len, NvmeCmd *cmd)
766 {
767     dma_addr_t addr, trans_len;
768     uint32_t dlen;
769     uint16_t status;
770 
771     for (int i = 0; i < nsgld; i++) {
772         uint8_t type = NVME_SGL_TYPE(segment[i].type);
773 
774         switch (type) {
775         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
776             if (cmd->opcode == NVME_CMD_WRITE) {
777                 continue;
778             }
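            /* fall through */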
779         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
780             break;
781         case NVME_SGL_DESCR_TYPE_SEGMENT:
782         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
783             return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
784         default:
785             return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
786         }
787 
788         dlen = le32_to_cpu(segment[i].len);
789 
790         if (!dlen) {
791             continue;
792         }
793 
794         if (*len == 0) {
795             /*
796              * All data has been mapped, but the SGL contains additional
797              * segments and/or descriptors. The controller might accept
798              * ignoring the rest of the SGL.
799              */
800             uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
801             if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
802                 break;
803             }
804 
805             trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
806             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
807         }
808 
809         trans_len = MIN(*len, dlen);
810 
811         if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
812             goto next;
813         }
814 
815         addr = le64_to_cpu(segment[i].addr);
816 
817         if (UINT64_MAX - addr < dlen) {
818             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
819         }
820 
821         status = nvme_map_addr(n, sg, addr, trans_len);
822         if (status) {
823             return status;
824         }
825 
826 next:
827         *len -= trans_len;
828     }
829 
830     return NVME_SUCCESS;
831 }
832 
833 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
834                              size_t len, NvmeCmd *cmd)
835 {
836     /*
837      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
838      * dynamically allocating a potentially huge SGL. The spec allows the SGL
839      * to be larger (as in number of bytes required to describe the SGL
840      * descriptors and segment chain) than the command transfer size, so it is
841      * not bounded by MDTS.
842      */
843     const int SEG_CHUNK_SIZE = 256;
844 
845     NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
846     uint64_t nsgld;
847     uint32_t seg_len;
848     uint16_t status;
849     hwaddr addr;
850     int ret;
851 
852     sgld = &sgl;
853     addr = le64_to_cpu(sgl.addr);
854 
855     trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
856 
857     nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
858 
859     /*
860      * If the entire transfer can be described with a single data block it can
861      * be mapped directly.
862      */
863     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
864         status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
865         if (status) {
866             goto unmap;
867         }
868 
869         goto out;
870     }
871 
872     for (;;) {
873         switch (NVME_SGL_TYPE(sgld->type)) {
874         case NVME_SGL_DESCR_TYPE_SEGMENT:
875         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
876             break;
877         default:
878             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
879         }
880 
881         seg_len = le32_to_cpu(sgld->len);
882 
883         /* check the length of the (Last) Segment descriptor */
884         if ((!seg_len || seg_len & 0xf) &&
885             (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
886             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
887         }
888 
889         if (UINT64_MAX - addr < seg_len) {
890             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
891         }
892 
893         nsgld = seg_len / sizeof(NvmeSglDescriptor);
894 
895         while (nsgld > SEG_CHUNK_SIZE) {
896             if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
897                 trace_pci_nvme_err_addr_read(addr);
898                 status = NVME_DATA_TRAS_ERROR;
899                 goto unmap;
900             }
901 
902             status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
903                                        &len, cmd);
904             if (status) {
905                 goto unmap;
906             }
907 
908             nsgld -= SEG_CHUNK_SIZE;
909             addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
910         }
911 
912         ret = nvme_addr_read(n, addr, segment, nsgld *
913                              sizeof(NvmeSglDescriptor));
914         if (ret) {
915             trace_pci_nvme_err_addr_read(addr);
916             status = NVME_DATA_TRAS_ERROR;
917             goto unmap;
918         }
919 
920         last_sgld = &segment[nsgld - 1];
921 
922         /*
923          * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
924          * then we are done.
925          */
926         switch (NVME_SGL_TYPE(last_sgld->type)) {
927         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
928         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
929             status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
930             if (status) {
931                 goto unmap;
932             }
933 
934             goto out;
935 
936         default:
937             break;
938         }
939 
940         /*
941          * If the last descriptor was not a Data Block or Bit Bucket, then the
942          * current segment must not be a Last Segment.
943          */
944         if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
945             status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
946             goto unmap;
947         }
948 
949         sgld = last_sgld;
950         addr = le64_to_cpu(sgld->addr);
951 
952         /*
953          * Do not map the last descriptor; it will be a Segment or Last Segment
954          * descriptor and is handled by the next iteration.
955          */
956         status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
957         if (status) {
958             goto unmap;
959         }
960     }
961 
962 out:
963     /* if there is any residual left in len, the SGL was too short */
964     if (len) {
965         status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
966         goto unmap;
967     }
968 
969     return NVME_SUCCESS;
970 
971 unmap:
972     nvme_sg_unmap(sg);
973     return status;
974 }
975 
976 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
977                        NvmeCmd *cmd)
978 {
979     uint64_t prp1, prp2;
980 
981     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
982     case NVME_PSDT_PRP:
983         prp1 = le64_to_cpu(cmd->dptr.prp1);
984         prp2 = le64_to_cpu(cmd->dptr.prp2);
985 
986         return nvme_map_prp(n, sg, prp1, prp2, len);
987     case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
988     case NVME_PSDT_SGL_MPTR_SGL:
989         return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
990     default:
991         return NVME_INVALID_FIELD;
992     }
993 }
994 
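/*
 * Map the Metadata Pointer (MPTR). If PSDT indicates that metadata is
 * described by an SGL, MPTR points to a single SGL descriptor which is mapped
 * like any other SGL; otherwise MPTR is the address of a contiguous buffer.
 */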
995 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
996                               NvmeCmd *cmd)
997 {
998     int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
999     hwaddr mptr = le64_to_cpu(cmd->mptr);
1000     uint16_t status;
1001 
1002     if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1003         NvmeSglDescriptor sgl;
1004 
1005         if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1006             return NVME_DATA_TRAS_ERROR;
1007         }
1008 
1009         status = nvme_map_sgl(n, sg, sgl, len, cmd);
1010         if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1011             status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1012         }
1013 
1014         return status;
1015     }
1016 
1017     nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1018     status = nvme_map_addr(n, sg, mptr, len);
1019     if (status) {
1020         nvme_sg_unmap(sg);
1021     }
1022 
1023     return status;
1024 }
1025 
1026 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1027 {
1028     NvmeNamespace *ns = req->ns;
1029     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1030     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1031     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1032     size_t len = nvme_l2b(ns, nlb);
1033     uint16_t status;
1034 
1035     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1036         NvmeSg sg;
1037 
1038         len += nvme_m2b(ns, nlb);
1039 
1040         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1041         if (status) {
1042             return status;
1043         }
1044 
1045         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1046         nvme_sg_split(&sg, ns, &req->sg, NULL);
1047         nvme_sg_unmap(&sg);
1048 
1049         return NVME_SUCCESS;
1050     }
1051 
1052     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1053 }
1054 
1055 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1056 {
1057     NvmeNamespace *ns = req->ns;
1058     size_t len = nvme_m2b(ns, nlb);
1059     uint16_t status;
1060 
1061     if (nvme_ns_ext(ns)) {
1062         NvmeSg sg;
1063 
1064         len += nvme_l2b(ns, nlb);
1065 
1066         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1067         if (status) {
1068             return status;
1069         }
1070 
1071         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1072         nvme_sg_split(&sg, ns, NULL, &req->sg);
1073         nvme_sg_unmap(&sg);
1074 
1075         return NVME_SUCCESS;
1076     }
1077 
1078     return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1079 }
1080 
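/*
 * Copy 'len' bytes between 'ptr' and the mapped extended-LBA buffer in 'sg',
 * taking 'bytes' bytes at a time and skipping 'skip_bytes' between chunks,
 * starting at 'offset'. Used to transfer only the data (or only the metadata)
 * portion of an interleaved buffer.
 */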
1081 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1082                                     uint32_t len, uint32_t bytes,
1083                                     int32_t skip_bytes, int64_t offset,
1084                                     NvmeTxDirection dir)
1085 {
1086     hwaddr addr;
1087     uint32_t trans_len, count = bytes;
1088     bool dma = sg->flags & NVME_SG_DMA;
1089     int64_t sge_len;
1090     int sg_idx = 0;
1091     int ret;
1092 
1093     assert(sg->flags & NVME_SG_ALLOC);
1094 
1095     while (len) {
1096         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1097 
1098         if (sge_len - offset < 0) {
1099             offset -= sge_len;
1100             sg_idx++;
1101             continue;
1102         }
1103 
1104         if (sge_len == offset) {
1105             offset = 0;
1106             sg_idx++;
1107             continue;
1108         }
1109 
1110         trans_len = MIN(len, count);
1111         trans_len = MIN(trans_len, sge_len - offset);
1112 
1113         if (dma) {
1114             addr = sg->qsg.sg[sg_idx].base + offset;
1115         } else {
1116             addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1117         }
1118 
1119         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1120             ret = nvme_addr_read(n, addr, ptr, trans_len);
1121         } else {
1122             ret = nvme_addr_write(n, addr, ptr, trans_len);
1123         }
1124 
1125         if (ret) {
1126             return NVME_DATA_TRAS_ERROR;
1127         }
1128 
1129         ptr += trans_len;
1130         len -= trans_len;
1131         count -= trans_len;
1132         offset += trans_len;
1133 
1134         if (count == 0) {
1135             count = bytes;
1136             offset += skip_bytes;
1137         }
1138     }
1139 
1140     return NVME_SUCCESS;
1141 }
1142 
1143 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1144                         NvmeTxDirection dir)
1145 {
1146     assert(sg->flags & NVME_SG_ALLOC);
1147 
1148     if (sg->flags & NVME_SG_DMA) {
1149         uint64_t residual;
1150 
1151         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1152             residual = dma_buf_write(ptr, len, &sg->qsg);
1153         } else {
1154             residual = dma_buf_read(ptr, len, &sg->qsg);
1155         }
1156 
1157         if (unlikely(residual)) {
1158             trace_pci_nvme_err_invalid_dma();
1159             return NVME_INVALID_FIELD | NVME_DNR;
1160         }
1161     } else {
1162         size_t bytes;
1163 
1164         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1165             bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1166         } else {
1167             bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1168         }
1169 
1170         if (unlikely(bytes != len)) {
1171             trace_pci_nvme_err_invalid_dma();
1172             return NVME_INVALID_FIELD | NVME_DNR;
1173         }
1174     }
1175 
1176     return NVME_SUCCESS;
1177 }
1178 
1179 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1180                                 NvmeRequest *req)
1181 {
1182     uint16_t status;
1183 
1184     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1185     if (status) {
1186         return status;
1187     }
1188 
1189     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1190 }
1191 
1192 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1193                                 NvmeRequest *req)
1194 {
1195     uint16_t status;
1196 
1197     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1198     if (status) {
1199         return status;
1200     }
1201 
1202     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1203 }
1204 
1205 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1206                           NvmeTxDirection dir, NvmeRequest *req)
1207 {
1208     NvmeNamespace *ns = req->ns;
1209     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1210     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1211     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1212 
1213     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1214         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1215                                    ns->lbaf.ms, 0, dir);
1216     }
1217 
1218     return nvme_tx(n, &req->sg, ptr, len, dir);
1219 }
1220 
1221 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1222                            NvmeTxDirection dir, NvmeRequest *req)
1223 {
1224     NvmeNamespace *ns = req->ns;
1225     uint16_t status;
1226 
1227     if (nvme_ns_ext(ns)) {
1228         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1229                                    ns->lbasz, ns->lbasz, dir);
1230     }
1231 
1232     nvme_sg_unmap(&req->sg);
1233 
1234     status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1235     if (status) {
1236         return status;
1237     }
1238 
1239     return nvme_tx(n, &req->sg, ptr, len, dir);
1240 }
1241 
1242 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1243                                  BlockCompletionFunc *cb, NvmeRequest *req)
1244 {
1245     assert(req->sg.flags & NVME_SG_ALLOC);
1246 
1247     if (req->sg.flags & NVME_SG_DMA) {
1248         req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1249                                   cb, req);
1250     } else {
1251         req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1252     }
1253 }
1254 
1255 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1256                                   BlockCompletionFunc *cb, NvmeRequest *req)
1257 {
1258     assert(req->sg.flags & NVME_SG_ALLOC);
1259 
1260     if (req->sg.flags & NVME_SG_DMA) {
1261         req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1262                                    cb, req);
1263     } else {
1264         req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1265     }
1266 }
1267 
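/*
 * Post completion queue entries for finished requests, return each completed
 * request to its submission queue's free list and raise the completion queue
 * interrupt if any entries are visible to the host.
 */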
1268 static void nvme_post_cqes(void *opaque)
1269 {
1270     NvmeCQueue *cq = opaque;
1271     NvmeCtrl *n = cq->ctrl;
1272     NvmeRequest *req, *next;
1273     bool pending = cq->head != cq->tail;
1274     int ret;
1275 
1276     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1277         NvmeSQueue *sq;
1278         hwaddr addr;
1279 
1280         if (nvme_cq_full(cq)) {
1281             break;
1282         }
1283 
1284         sq = req->sq;
1285         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1286         req->cqe.sq_id = cpu_to_le16(sq->sqid);
1287         req->cqe.sq_head = cpu_to_le16(sq->head);
1288         addr = cq->dma_addr + cq->tail * n->cqe_size;
1289         ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1290                             sizeof(req->cqe));
1291         if (ret) {
1292             trace_pci_nvme_err_addr_write(addr);
1293             trace_pci_nvme_err_cfs();
1294             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1295             break;
1296         }
1297         QTAILQ_REMOVE(&cq->req_list, req, entry);
1298         nvme_inc_cq_tail(cq);
1299         nvme_sg_unmap(&req->sg);
1300         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1301     }
1302     if (cq->tail != cq->head) {
1303         if (cq->irq_enabled && !pending) {
1304             n->cq_pending++;
1305         }
1306 
1307         nvme_irq_assert(n, cq);
1308     }
1309 }
1310 
1311 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1312 {
1313     assert(cq->cqid == req->sq->cqid);
1314     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1315                                           le32_to_cpu(req->cqe.result),
1316                                           le32_to_cpu(req->cqe.dw1),
1317                                           req->status);
1318 
1319     if (req->status) {
1320         trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1321                                       req->status, req->cmd.opcode);
1322     }
1323 
1324     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1325     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1326     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1327 }
1328 
1329 static void nvme_process_aers(void *opaque)
1330 {
1331     NvmeCtrl *n = opaque;
1332     NvmeAsyncEvent *event, *next;
1333 
1334     trace_pci_nvme_process_aers(n->aer_queued);
1335 
1336     QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1337         NvmeRequest *req;
1338         NvmeAerResult *result;
1339 
1340         /* can't post cqe if there is nothing to complete */
1341         if (!n->outstanding_aers) {
1342             trace_pci_nvme_no_outstanding_aers();
1343             break;
1344         }
1345 
1346         /* ignore if masked (cqe posted, but event not cleared) */
1347         if (n->aer_mask & (1 << event->result.event_type)) {
1348             trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1349             continue;
1350         }
1351 
1352         QTAILQ_REMOVE(&n->aer_queue, event, entry);
1353         n->aer_queued--;
1354 
1355         n->aer_mask |= 1 << event->result.event_type;
1356         n->outstanding_aers--;
1357 
1358         req = n->aer_reqs[n->outstanding_aers];
1359 
1360         result = (NvmeAerResult *) &req->cqe.result;
1361         result->event_type = event->result.event_type;
1362         result->event_info = event->result.event_info;
1363         result->log_page = event->result.log_page;
1364         g_free(event);
1365 
1366         trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1367                                     result->log_page);
1368 
1369         nvme_enqueue_req_completion(&n->admin_cq, req);
1370     }
1371 }
1372 
1373 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1374                                uint8_t event_info, uint8_t log_page)
1375 {
1376     NvmeAsyncEvent *event;
1377 
1378     trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1379 
1380     if (n->aer_queued == n->params.aer_max_queued) {
1381         trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1382         return;
1383     }
1384 
1385     event = g_new(NvmeAsyncEvent, 1);
1386     event->result = (NvmeAerResult) {
1387         .event_type = event_type,
1388         .event_info = event_info,
1389         .log_page   = log_page,
1390     };
1391 
1392     QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1393     n->aer_queued++;
1394 
1395     nvme_process_aers(n);
1396 }
1397 
1398 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1399 {
1400     uint8_t aer_info;
1401 
1402     /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1403     if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1404         return;
1405     }
1406 
1407     switch (event) {
1408     case NVME_SMART_SPARE:
1409         aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1410         break;
1411     case NVME_SMART_TEMPERATURE:
1412         aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1413         break;
1414     case NVME_SMART_RELIABILITY:
1415     case NVME_SMART_MEDIA_READ_ONLY:
1416     case NVME_SMART_FAILED_VOLATILE_MEDIA:
1417     case NVME_SMART_PMR_UNRELIABLE:
1418         aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1419         break;
1420     default:
1421         return;
1422     }
1423 
1424     nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1425 }
1426 
1427 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1428 {
1429     n->aer_mask &= ~(1 << event_type);
1430     if (!QTAILQ_EMPTY(&n->aer_queue)) {
1431         nvme_process_aers(n);
1432     }
1433 }
1434 
1435 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1436 {
1437     uint8_t mdts = n->params.mdts;
1438 
1439     if (mdts && len > n->page_size << mdts) {
1440         trace_pci_nvme_err_mdts(len);
1441         return NVME_INVALID_FIELD | NVME_DNR;
1442     }
1443 
1444     return NVME_SUCCESS;
1445 }
1446 
1447 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1448                                          uint32_t nlb)
1449 {
1450     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1451 
1452     if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1453         trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1454         return NVME_LBA_RANGE | NVME_DNR;
1455     }
1456 
1457     return NVME_SUCCESS;
1458 }
1459 
1460 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1461                                  uint32_t nlb, int flags)
1462 {
1463     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1464 
1465     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1466     int64_t offset = nvme_l2b(ns, slba);
1467     int ret;
1468 
1469     /*
1470      * `pnum` holds the number of bytes after offset that share the same
1471      * allocation status as the byte at offset. If `pnum` is different from
1472      * `bytes`, we should check the allocation status of the next range and
1473      * continue this until all bytes have been checked.
1474      */
1475     do {
1476         bytes -= pnum;
1477 
1478         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1479         if (ret < 0) {
1480             return ret;
1481         }
1482 
1484         trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1485                                     !!(ret & BDRV_BLOCK_ZERO));
1486 
1487         if (!(ret & flags)) {
1488             return 1;
1489         }
1490 
1491         offset += pnum;
1492     } while (pnum != bytes);
1493 
1494     return 0;
1495 }
1496 
1497 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1498                                  uint32_t nlb)
1499 {
1500     int ret;
1501     Error *err = NULL;
1502 
1503     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1504     if (ret) {
1505         if (ret < 0) {
1506             error_setg_errno(&err, -ret, "unable to get block status");
1507             error_report_err(err);
1508 
1509             return NVME_INTERNAL_DEV_ERROR;
1510         }
1511 
1512         return NVME_DULB;
1513     }
1514 
1515     return NVME_SUCCESS;
1516 }
1517 
1518 static void nvme_aio_err(NvmeRequest *req, int ret)
1519 {
1520     uint16_t status = NVME_SUCCESS;
1521     Error *local_err = NULL;
1522 
1523     switch (req->cmd.opcode) {
1524     case NVME_CMD_READ:
1525         status = NVME_UNRECOVERED_READ;
1526         break;
1527     case NVME_CMD_FLUSH:
1528     case NVME_CMD_WRITE:
1529     case NVME_CMD_WRITE_ZEROES:
1530     case NVME_CMD_ZONE_APPEND:
1531         status = NVME_WRITE_FAULT;
1532         break;
1533     default:
1534         status = NVME_INTERNAL_DEV_ERROR;
1535         break;
1536     }
1537 
1538     trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1539 
1540     error_setg_errno(&local_err, -ret, "aio failed");
1541     error_report_err(local_err);
1542 
1543     /*
1544      * Set the command status code to the first encountered error but allow a
1545      * subsequent Internal Device Error to trump it.
1546      */
1547     if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1548         return;
1549     }
1550 
1551     req->status = status;
1552 }
1553 
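/*
 * Convert a starting LBA to a zone index, using a shift when the zone size is
 * a power of two and a division otherwise.
 */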
1554 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1555 {
1556     return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1557                                     slba / ns->zone_size;
1558 }
1559 
1560 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1561 {
1562     uint32_t zone_idx = nvme_zone_idx(ns, slba);
1563 
1564     if (zone_idx >= ns->num_zones) {
1565         return NULL;
1566     }
1567 
1568     return &ns->zone_array[zone_idx];
1569 }
1570 
1571 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1572 {
1573     uint64_t zslba = zone->d.zslba;
1574 
1575     switch (nvme_get_zone_state(zone)) {
1576     case NVME_ZONE_STATE_EMPTY:
1577     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1578     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1579     case NVME_ZONE_STATE_CLOSED:
1580         return NVME_SUCCESS;
1581     case NVME_ZONE_STATE_FULL:
1582         trace_pci_nvme_err_zone_is_full(zslba);
1583         return NVME_ZONE_FULL;
1584     case NVME_ZONE_STATE_OFFLINE:
1585         trace_pci_nvme_err_zone_is_offline(zslba);
1586         return NVME_ZONE_OFFLINE;
1587     case NVME_ZONE_STATE_READ_ONLY:
1588         trace_pci_nvme_err_zone_is_read_only(zslba);
1589         return NVME_ZONE_READ_ONLY;
1590     default:
1591         assert(false);
1592     }
1593 
1594     return NVME_INTERNAL_DEV_ERROR;
1595 }
1596 
1597 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1598                                       uint64_t slba, uint32_t nlb)
1599 {
1600     uint64_t zcap = nvme_zone_wr_boundary(zone);
1601     uint16_t status;
1602 
1603     status = nvme_check_zone_state_for_write(zone);
1604     if (status) {
1605         return status;
1606     }
1607 
1608     if (unlikely(slba != zone->w_ptr)) {
1609         trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1610         return NVME_ZONE_INVALID_WRITE;
1611     }
1612 
1613     if (unlikely((slba + nlb) > zcap)) {
1614         trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1615         return NVME_ZONE_BOUNDARY_ERROR;
1616     }
1617 
1618     return NVME_SUCCESS;
1619 }
1620 
1621 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1622 {
1623     switch (nvme_get_zone_state(zone)) {
1624     case NVME_ZONE_STATE_EMPTY:
1625     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1626     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1627     case NVME_ZONE_STATE_FULL:
1628     case NVME_ZONE_STATE_CLOSED:
1629     case NVME_ZONE_STATE_READ_ONLY:
1630         return NVME_SUCCESS;
1631     case NVME_ZONE_STATE_OFFLINE:
1632         trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1633         return NVME_ZONE_OFFLINE;
1634     default:
1635         assert(false);
1636     }
1637 
1638     return NVME_INTERNAL_DEV_ERROR;
1639 }
1640 
1641 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1642                                      uint32_t nlb)
1643 {
1644     NvmeZone *zone;
1645     uint64_t bndry, end;
1646     uint16_t status;
1647 
1648     zone = nvme_get_zone_by_slba(ns, slba);
1649     assert(zone);
1650 
1651     bndry = nvme_zone_rd_boundary(ns, zone);
1652     end = slba + nlb;
1653 
1654     status = nvme_check_zone_state_for_read(zone);
1655     if (status) {
1656         ;
1657     } else if (unlikely(end > bndry)) {
1658         if (!ns->params.cross_zone_read) {
1659             status = NVME_ZONE_BOUNDARY_ERROR;
1660         } else {
1661             /*
1662              * Read across zone boundary - check that all subsequent
1663              * zones that are being read have an appropriate state.
1664              */
1665             do {
1666                 zone++;
1667                 status = nvme_check_zone_state_for_read(zone);
1668                 if (status) {
1669                     break;
1670                 }
1671             } while (end > nvme_zone_rd_boundary(ns, zone));
1672         }
1673     }
1674 
1675     return status;
1676 }
1677 
1678 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1679 {
1680     switch (nvme_get_zone_state(zone)) {
1681     case NVME_ZONE_STATE_FULL:
1682         return NVME_SUCCESS;
1683 
1684     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1685     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1686         nvme_aor_dec_open(ns);
1687         /* fallthrough */
1688     case NVME_ZONE_STATE_CLOSED:
1689         nvme_aor_dec_active(ns);
1690         /* fallthrough */
1691     case NVME_ZONE_STATE_EMPTY:
1692         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1693         return NVME_SUCCESS;
1694 
1695     default:
1696         return NVME_ZONE_INVAL_TRANSITION;
1697     }
1698 }
1699 
1700 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1701 {
1702     switch (nvme_get_zone_state(zone)) {
1703     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1704     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1705         nvme_aor_dec_open(ns);
1706         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1707         /* fall through */
1708     case NVME_ZONE_STATE_CLOSED:
1709         return NVME_SUCCESS;
1710 
1711     default:
1712         return NVME_ZONE_INVAL_TRANSITION;
1713     }
1714 }
1715 
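/*
 * Zone Reset state machine: release any open/active resources held by the
 * zone (note the deliberate fallthroughs), rewind both the internal write
 * pointer and the reported write pointer to ZSLBA, and transition the zone
 * to Empty. Resetting an already Empty zone is a successful no-op.
 */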
1716 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1717 {
1718     switch (nvme_get_zone_state(zone)) {
1719     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1720     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1721         nvme_aor_dec_open(ns);
1722         /* fallthrough */
1723     case NVME_ZONE_STATE_CLOSED:
1724         nvme_aor_dec_active(ns);
1725         /* fallthrough */
1726     case NVME_ZONE_STATE_FULL:
1727         zone->w_ptr = zone->d.zslba;
1728         zone->d.wp = zone->w_ptr;
1729         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1730         /* fallthrough */
1731     case NVME_ZONE_STATE_EMPTY:
1732         return NVME_SUCCESS;
1733 
1734     default:
1735         return NVME_ZONE_INVAL_TRANSITION;
1736     }
1737 }
1738 
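/*
 * If the namespace is at its max_open_zones limit, implicitly close the
 * oldest implicitly open zone (the head of imp_open_zones) so that a new
 * zone can be opened in its place.
 */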
1739 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1740 {
1741     NvmeZone *zone;
1742 
1743     if (ns->params.max_open_zones &&
1744         ns->nr_open_zones == ns->params.max_open_zones) {
1745         zone = QTAILQ_FIRST(&ns->imp_open_zones);
1746         if (zone) {
1747             /*
1748              * Automatically close this implicitly open zone.
1749              */
1750             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1751             nvme_zrm_close(ns, zone);
1752         }
1753     }
1754 }
1755 
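/*
 * Zone open path shared by explicit Open Zone commands and implicit opens on
 * write. NVME_ZRM_AUTO selects the implicit variant, leaving the zone
 * Implicitly Opened instead of Explicitly Opened. Opening from Empty also
 * makes the zone active; both the active and open resource limits are
 * checked via nvme_aor_check() before any accounting is incremented.
 */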
1756 enum {
1757     NVME_ZRM_AUTO = 1 << 0,
1758 };
1759 
1760 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1761                                     NvmeZone *zone, int flags)
1762 {
1763     int act = 0;
1764     uint16_t status;
1765 
1766     switch (nvme_get_zone_state(zone)) {
1767     case NVME_ZONE_STATE_EMPTY:
1768         act = 1;
1769 
1770         /* fallthrough */
1771 
1772     case NVME_ZONE_STATE_CLOSED:
1773         if (n->params.auto_transition_zones) {
1774             nvme_zrm_auto_transition_zone(ns);
1775         }
1776         status = nvme_aor_check(ns, act, 1);
1777         if (status) {
1778             return status;
1779         }
1780 
1781         if (act) {
1782             nvme_aor_inc_active(ns);
1783         }
1784 
1785         nvme_aor_inc_open(ns);
1786 
1787         if (flags & NVME_ZRM_AUTO) {
1788             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1789             return NVME_SUCCESS;
1790         }
1791 
1792         /* fallthrough */
1793 
1794     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1795         if (flags & NVME_ZRM_AUTO) {
1796             return NVME_SUCCESS;
1797         }
1798 
1799         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1800 
1801         /* fallthrough */
1802 
1803     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1804         return NVME_SUCCESS;
1805 
1806     default:
1807         return NVME_ZONE_INVAL_TRANSITION;
1808     }
1809 }
1810 
1811 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1812                                      NvmeZone *zone)
1813 {
1814     return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1815 }
1816 
1817 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
1818                                      NvmeZone *zone)
1819 {
1820     return nvme_zrm_open_flags(n, ns, zone, 0);
1821 }
1822 
1823 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1824                                  uint32_t nlb)
1825 {
1826     zone->d.wp += nlb;
1827 
1828     if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1829         nvme_zrm_finish(ns, zone);
1830     }
1831 }
1832 
1833 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1834 {
1835     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1836     NvmeZone *zone;
1837     uint64_t slba;
1838     uint32_t nlb;
1839 
1840     slba = le64_to_cpu(rw->slba);
1841     nlb = le16_to_cpu(rw->nlb) + 1;
1842     zone = nvme_get_zone_by_slba(ns, slba);
1843     assert(zone);
1844 
1845     nvme_advance_zone_wp(ns, zone, nlb);
1846 }
1847 
1848 static inline bool nvme_is_write(NvmeRequest *req)
1849 {
1850     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1851 
1852     return rw->opcode == NVME_CMD_WRITE ||
1853            rw->opcode == NVME_CMD_ZONE_APPEND ||
1854            rw->opcode == NVME_CMD_WRITE_ZEROES;
1855 }
1856 
1857 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1858 {
1859     return qemu_get_aio_context();
1860 }
1861 
1862 static void nvme_misc_cb(void *opaque, int ret)
1863 {
1864     NvmeRequest *req = opaque;
1865 
1866     trace_pci_nvme_misc_cb(nvme_cid(req));
1867 
1868     if (ret) {
1869         nvme_aio_err(req, ret);
1870     }
1871 
1872     nvme_enqueue_req_completion(nvme_cq(req), req);
1873 }
1874 
1875 void nvme_rw_complete_cb(void *opaque, int ret)
1876 {
1877     NvmeRequest *req = opaque;
1878     NvmeNamespace *ns = req->ns;
1879     BlockBackend *blk = ns->blkconf.blk;
1880     BlockAcctCookie *acct = &req->acct;
1881     BlockAcctStats *stats = blk_get_stats(blk);
1882 
1883     trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1884 
1885     if (ret) {
1886         block_acct_failed(stats, acct);
1887         nvme_aio_err(req, ret);
1888     } else {
1889         block_acct_done(stats, acct);
1890     }
1891 
1892     if (ns->params.zoned && nvme_is_write(req)) {
1893         nvme_finalize_zoned_write(ns, req);
1894     }
1895 
1896     nvme_enqueue_req_completion(nvme_cq(req), req);
1897 }
1898 
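/*
 * Completion callback for the data portion of a read/write. For namespaces
 * formatted with metadata this chains a second block operation for the
 * metadata region (zeroing it for Write Zeroes, or transferring it for
 * extended LBAs or when MPTR is set) before handing off to
 * nvme_rw_complete_cb(), which does the accounting and completes the request.
 */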
1899 static void nvme_rw_cb(void *opaque, int ret)
1900 {
1901     NvmeRequest *req = opaque;
1902     NvmeNamespace *ns = req->ns;
1903 
1904     BlockBackend *blk = ns->blkconf.blk;
1905 
1906     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1907 
1908     if (ret) {
1909         goto out;
1910     }
1911 
1912     if (ns->lbaf.ms) {
1913         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1914         uint64_t slba = le64_to_cpu(rw->slba);
1915         uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1916         uint64_t offset = nvme_moff(ns, slba);
1917 
1918         if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1919             size_t mlen = nvme_m2b(ns, nlb);
1920 
1921             req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1922                                                BDRV_REQ_MAY_UNMAP,
1923                                                nvme_rw_complete_cb, req);
1924             return;
1925         }
1926 
1927         if (nvme_ns_ext(ns) || req->cmd.mptr) {
1928             uint16_t status;
1929 
1930             nvme_sg_unmap(&req->sg);
1931             status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1932             if (status) {
1933                 ret = -EFAULT;
1934                 goto out;
1935             }
1936 
1937             if (req->cmd.opcode == NVME_CMD_READ) {
1938                 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1939             }
1940 
1941             return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1942         }
1943     }
1944 
1945 out:
1946     nvme_rw_complete_cb(req, ret);
1947 }
1948 
1949 static void nvme_verify_cb(void *opaque, int ret)
1950 {
1951     NvmeBounceContext *ctx = opaque;
1952     NvmeRequest *req = ctx->req;
1953     NvmeNamespace *ns = req->ns;
1954     BlockBackend *blk = ns->blkconf.blk;
1955     BlockAcctCookie *acct = &req->acct;
1956     BlockAcctStats *stats = blk_get_stats(blk);
1957     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1958     uint64_t slba = le64_to_cpu(rw->slba);
1959     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
1960     uint16_t apptag = le16_to_cpu(rw->apptag);
1961     uint16_t appmask = le16_to_cpu(rw->appmask);
1962     uint32_t reftag = le32_to_cpu(rw->reftag);
1963     uint16_t status;
1964 
1965     trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
1966 
1967     if (ret) {
1968         block_acct_failed(stats, acct);
1969         nvme_aio_err(req, ret);
1970         goto out;
1971     }
1972 
1973     block_acct_done(stats, acct);
1974 
1975     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1976         status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1977                                        ctx->mdata.iov.size, slba);
1978         if (status) {
1979             req->status = status;
1980             goto out;
1981         }
1982 
1983         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1984                                      ctx->mdata.bounce, ctx->mdata.iov.size,
1985                                      prinfo, slba, apptag, appmask, &reftag);
1986     }
1987 
1988 out:
1989     qemu_iovec_destroy(&ctx->data.iov);
1990     g_free(ctx->data.bounce);
1991 
1992     qemu_iovec_destroy(&ctx->mdata.iov);
1993     g_free(ctx->mdata.bounce);
1994 
1995     g_free(ctx);
1996 
1997     nvme_enqueue_req_completion(nvme_cq(req), req);
1998 }
1999 
2000 
2001 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2002 {
2003     NvmeBounceContext *ctx = opaque;
2004     NvmeRequest *req = ctx->req;
2005     NvmeNamespace *ns = req->ns;
2006     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2007     uint64_t slba = le64_to_cpu(rw->slba);
2008     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2009     size_t mlen = nvme_m2b(ns, nlb);
2010     uint64_t offset = nvme_moff(ns, slba);
2011     BlockBackend *blk = ns->blkconf.blk;
2012 
2013     trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2014 
2015     if (ret) {
2016         goto out;
2017     }
2018 
2019     ctx->mdata.bounce = g_malloc(mlen);
2020 
2021     qemu_iovec_reset(&ctx->mdata.iov);
2022     qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2023 
2024     req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2025                                 nvme_verify_cb, ctx);
2026     return;
2027 
2028 out:
2029     nvme_verify_cb(ctx, ret);
2030 }
2031 
2032 struct nvme_compare_ctx {
2033     struct {
2034         QEMUIOVector iov;
2035         uint8_t *bounce;
2036     } data;
2037 
2038     struct {
2039         QEMUIOVector iov;
2040         uint8_t *bounce;
2041     } mdata;
2042 };
2043 
2044 static void nvme_compare_mdata_cb(void *opaque, int ret)
2045 {
2046     NvmeRequest *req = opaque;
2047     NvmeNamespace *ns = req->ns;
2048     NvmeCtrl *n = nvme_ctrl(req);
2049     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2050     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2051     uint16_t apptag = le16_to_cpu(rw->apptag);
2052     uint16_t appmask = le16_to_cpu(rw->appmask);
2053     uint32_t reftag = le32_to_cpu(rw->reftag);
2054     struct nvme_compare_ctx *ctx = req->opaque;
2055     g_autofree uint8_t *buf = NULL;
2056     BlockBackend *blk = ns->blkconf.blk;
2057     BlockAcctCookie *acct = &req->acct;
2058     BlockAcctStats *stats = blk_get_stats(blk);
2059     uint16_t status = NVME_SUCCESS;
2060 
2061     trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2062 
2063     if (ret) {
2064         block_acct_failed(stats, acct);
2065         nvme_aio_err(req, ret);
2066         goto out;
2067     }
2068 
2069     buf = g_malloc(ctx->mdata.iov.size);
2070 
2071     status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2072                                NVME_TX_DIRECTION_TO_DEVICE, req);
2073     if (status) {
2074         req->status = status;
2075         goto out;
2076     }
2077 
2078     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2079         uint64_t slba = le64_to_cpu(rw->slba);
2080         uint8_t *bufp;
2081         uint8_t *mbufp = ctx->mdata.bounce;
2082         uint8_t *end = mbufp + ctx->mdata.iov.size;
2083         int16_t pil = 0;
2084 
2085         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2086                                 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2087                                 slba, apptag, appmask, &reftag);
2088         if (status) {
2089             req->status = status;
2090             goto out;
2091         }
2092 
2093         /*
2094          * When formatted with protection information, do not compare the DIF
2095          * tuple.
2096          */
2097         if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2098             pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2099         }
2100 
2101         for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2102             if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2103                 req->status = NVME_CMP_FAILURE;
2104                 goto out;
2105             }
2106         }
2107 
2108         goto out;
2109     }
2110 
2111     if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2112         req->status = NVME_CMP_FAILURE;
2113         goto out;
2114     }
2115 
2116     block_acct_done(stats, acct);
2117 
2118 out:
2119     qemu_iovec_destroy(&ctx->data.iov);
2120     g_free(ctx->data.bounce);
2121 
2122     qemu_iovec_destroy(&ctx->mdata.iov);
2123     g_free(ctx->mdata.bounce);
2124 
2125     g_free(ctx);
2126 
2127     nvme_enqueue_req_completion(nvme_cq(req), req);
2128 }
2129 
2130 static void nvme_compare_data_cb(void *opaque, int ret)
2131 {
2132     NvmeRequest *req = opaque;
2133     NvmeCtrl *n = nvme_ctrl(req);
2134     NvmeNamespace *ns = req->ns;
2135     BlockBackend *blk = ns->blkconf.blk;
2136     BlockAcctCookie *acct = &req->acct;
2137     BlockAcctStats *stats = blk_get_stats(blk);
2138 
2139     struct nvme_compare_ctx *ctx = req->opaque;
2140     g_autofree uint8_t *buf = NULL;
2141     uint16_t status;
2142 
2143     trace_pci_nvme_compare_data_cb(nvme_cid(req));
2144 
2145     if (ret) {
2146         block_acct_failed(stats, acct);
2147         nvme_aio_err(req, ret);
2148         goto out;
2149     }
2150 
2151     buf = g_malloc(ctx->data.iov.size);
2152 
2153     status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2154                               NVME_TX_DIRECTION_TO_DEVICE, req);
2155     if (status) {
2156         req->status = status;
2157         goto out;
2158     }
2159 
2160     if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2161         req->status = NVME_CMP_FAILURE;
2162         goto out;
2163     }
2164 
2165     if (ns->lbaf.ms) {
2166         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2167         uint64_t slba = le64_to_cpu(rw->slba);
2168         uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2169         size_t mlen = nvme_m2b(ns, nlb);
2170         uint64_t offset = nvme_moff(ns, slba);
2171 
2172         ctx->mdata.bounce = g_malloc(mlen);
2173 
2174         qemu_iovec_init(&ctx->mdata.iov, 1);
2175         qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2176 
2177         req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2178                                     nvme_compare_mdata_cb, req);
2179         return;
2180     }
2181 
2182     block_acct_done(stats, acct);
2183 
2184 out:
2185     qemu_iovec_destroy(&ctx->data.iov);
2186     g_free(ctx->data.bounce);
2187     g_free(ctx);
2188 
2189     nvme_enqueue_req_completion(nvme_cq(req), req);
2190 }
2191 
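/*
 * Dataset Management (deallocate) runs as a chain of asynchronous discards,
 * one source range at a time: nvme_dsm_cb() issues the discard for range
 * `idx`, nvme_dsm_md_cb() optionally zeroes the corresponding metadata, and
 * the pair iterate until all ranges are done or an error/cancellation is
 * recorded in `ret`. The bottom half (`bh`) delivers the final completion.
 */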
2192 typedef struct NvmeDSMAIOCB {
2193     BlockAIOCB common;
2194     BlockAIOCB *aiocb;
2195     NvmeRequest *req;
2196     QEMUBH *bh;
2197     int ret;
2198 
2199     NvmeDsmRange *range;
2200     unsigned int nr;
2201     unsigned int idx;
2202 } NvmeDSMAIOCB;
2203 
2204 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2205 {
2206     NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2207 
2208     /* break nvme_dsm_cb loop */
2209     iocb->idx = iocb->nr;
2210     iocb->ret = -ECANCELED;
2211 
2212     if (iocb->aiocb) {
2213         blk_aio_cancel_async(iocb->aiocb);
2214         iocb->aiocb = NULL;
2215     } else {
2216         /*
2217          * We only reach this if nvme_dsm_cancel() has already been called or
2218          * the command ran to completion and nvme_dsm_bh is scheduled to run.
2219          */
2220         assert(iocb->idx == iocb->nr);
2221     }
2222 }
2223 
2224 static const AIOCBInfo nvme_dsm_aiocb_info = {
2225     .aiocb_size   = sizeof(NvmeDSMAIOCB),
2226     .cancel_async = nvme_dsm_cancel,
2227 };
2228 
2229 static void nvme_dsm_bh(void *opaque)
2230 {
2231     NvmeDSMAIOCB *iocb = opaque;
2232 
2233     iocb->common.cb(iocb->common.opaque, iocb->ret);
2234 
2235     qemu_bh_delete(iocb->bh);
2236     iocb->bh = NULL;
2237     qemu_aio_unref(iocb);
2238 }
2239 
2240 static void nvme_dsm_cb(void *opaque, int ret);
2241 
2242 static void nvme_dsm_md_cb(void *opaque, int ret)
2243 {
2244     NvmeDSMAIOCB *iocb = opaque;
2245     NvmeRequest *req = iocb->req;
2246     NvmeNamespace *ns = req->ns;
2247     NvmeDsmRange *range;
2248     uint64_t slba;
2249     uint32_t nlb;
2250 
2251     if (ret < 0) {
2252         iocb->ret = ret;
2253         goto done;
2254     }
2255 
2256     if (!ns->lbaf.ms) {
2257         nvme_dsm_cb(iocb, 0);
2258         return;
2259     }
2260 
2261     range = &iocb->range[iocb->idx - 1];
2262     slba = le64_to_cpu(range->slba);
2263     nlb = le32_to_cpu(range->nlb);
2264 
2265     /*
2266      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2267      * the metadata.
2268      */
2269 
2270     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2271     if (ret) {
2272         if (ret < 0) {
2273             iocb->ret = ret;
2274             goto done;
2275         }
2276 
2277         nvme_dsm_cb(iocb, 0);
        return;
2278     }
2279 
2280     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2281                                         nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2282                                         nvme_dsm_cb, iocb);
2283     return;
2284 
2285 done:
2286     iocb->aiocb = NULL;
2287     qemu_bh_schedule(iocb->bh);
2288 }
2289 
2290 static void nvme_dsm_cb(void *opaque, int ret)
2291 {
2292     NvmeDSMAIOCB *iocb = opaque;
2293     NvmeRequest *req = iocb->req;
2294     NvmeCtrl *n = nvme_ctrl(req);
2295     NvmeNamespace *ns = req->ns;
2296     NvmeDsmRange *range;
2297     uint64_t slba;
2298     uint32_t nlb;
2299 
2300     if (ret < 0) {
2301         iocb->ret = ret;
2302         goto done;
2303     }
2304 
2305 next:
2306     if (iocb->idx == iocb->nr) {
2307         goto done;
2308     }
2309 
2310     range = &iocb->range[iocb->idx++];
2311     slba = le64_to_cpu(range->slba);
2312     nlb = le32_to_cpu(range->nlb);
2313 
2314     trace_pci_nvme_dsm_deallocate(slba, nlb);
2315 
2316     if (nlb > n->dmrsl) {
2317         trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2318         goto next;
2319     }
2320 
2321     if (nvme_check_bounds(ns, slba, nlb)) {
2322         trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2323                                              ns->id_ns.nsze);
2324         goto next;
2325     }
2326 
2327     iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2328                                    nvme_l2b(ns, nlb),
2329                                    nvme_dsm_md_cb, iocb);
2330     return;
2331 
2332 done:
2333     iocb->aiocb = NULL;
2334     qemu_bh_schedule(iocb->bh);
2335 }
2336 
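/*
 * Only the Attribute - Deallocate (AD) bit is acted upon: the source ranges
 * are copied from the host, the AIOCB chain above is started with
 * nvme_dsm_cb(iocb, 0) and the command completes asynchronously. Without AD
 * the command succeeds immediately as a no-op.
 */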
2337 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2338 {
2339     NvmeNamespace *ns = req->ns;
2340     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2341     uint32_t attr = le32_to_cpu(dsm->attributes);
2342     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2343     uint16_t status = NVME_SUCCESS;
2344 
2345     trace_pci_nvme_dsm(nr, attr);
2346 
2347     if (attr & NVME_DSMGMT_AD) {
2348         NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2349                                          nvme_misc_cb, req);
2350 
2351         iocb->req = req;
2352         iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2353         iocb->ret = 0;
2354         iocb->range = g_new(NvmeDsmRange, nr);
2355         iocb->nr = nr;
2356         iocb->idx = 0;
2357 
2358         status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2359                           req);
2360         if (status) {
2361             return status;
2362         }
2363 
2364         req->aiocb = &iocb->common;
2365         nvme_dsm_cb(iocb, 0);
2366 
2367         return NVME_NO_COMPLETE;
2368     }
2369 
2370     return status;
2371 }
2372 
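/*
 * Verify reads the data (and then the metadata, via nvme_verify_mdata_in_cb())
 * into bounce buffers and, for namespaces formatted with protection
 * information, checks it with nvme_dif_check() without transferring anything
 * to the host. The verify size is capped by the VSL parameter
 * (page_size << vsl).
 */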
2373 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2374 {
2375     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2376     NvmeNamespace *ns = req->ns;
2377     BlockBackend *blk = ns->blkconf.blk;
2378     uint64_t slba = le64_to_cpu(rw->slba);
2379     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2380     size_t len = nvme_l2b(ns, nlb);
2381     int64_t offset = nvme_l2b(ns, slba);
2382     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2383     uint32_t reftag = le32_to_cpu(rw->reftag);
2384     NvmeBounceContext *ctx = NULL;
2385     uint16_t status;
2386 
2387     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2388 
2389     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2390         status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2391         if (status) {
2392             return status;
2393         }
2394 
2395         if (prinfo & NVME_PRINFO_PRACT) {
2396             return NVME_INVALID_PROT_INFO | NVME_DNR;
2397         }
2398     }
2399 
2400     if (len > n->page_size << n->params.vsl) {
2401         return NVME_INVALID_FIELD | NVME_DNR;
2402     }
2403 
2404     status = nvme_check_bounds(ns, slba, nlb);
2405     if (status) {
2406         return status;
2407     }
2408 
2409     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2410         status = nvme_check_dulbe(ns, slba, nlb);
2411         if (status) {
2412             return status;
2413         }
2414     }
2415 
2416     ctx = g_new0(NvmeBounceContext, 1);
2417     ctx->req = req;
2418 
2419     ctx->data.bounce = g_malloc(len);
2420 
2421     qemu_iovec_init(&ctx->data.iov, 1);
2422     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2423 
2424     block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2425                      BLOCK_ACCT_READ);
2426 
2427     req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2428                                 nvme_verify_mdata_in_cb, ctx);
2429     return NVME_NO_COMPLETE;
2430 }
2431 
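/*
 * Copy is a per-range read/write pipeline driven by callbacks:
 * nvme_copy_cb() validates source range `idx` and reads its data into the
 * bounce buffer, nvme_copy_in_cb() reads the source metadata (if any),
 * nvme_copy_in_completed_cb() re-checks/re-generates protection information
 * and writes the data at SDLBA, nvme_copy_out_cb() writes the metadata and
 * nvme_copy_out_completed_cb() advances to the next range. On error, the
 * number of completed ranges is reported in the CQE (see nvme_copy_bh()).
 */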
2432 typedef struct NvmeCopyAIOCB {
2433     BlockAIOCB common;
2434     BlockAIOCB *aiocb;
2435     NvmeRequest *req;
2436     QEMUBH *bh;
2437     int ret;
2438 
2439     NvmeCopySourceRange *ranges;
2440     int nr;
2441     int idx;
2442 
2443     uint8_t *bounce;
2444     QEMUIOVector iov;
2445     struct {
2446         BlockAcctCookie read;
2447         BlockAcctCookie write;
2448     } acct;
2449 
2450     uint32_t reftag;
2451     uint64_t slba;
2452 
2453     NvmeZone *zone;
2454 } NvmeCopyAIOCB;
2455 
2456 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2457 {
2458     NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2459 
2460     iocb->ret = -ECANCELED;
2461 
2462     if (iocb->aiocb) {
2463         blk_aio_cancel_async(iocb->aiocb);
2464         iocb->aiocb = NULL;
2465     }
2466 }
2467 
2468 static const AIOCBInfo nvme_copy_aiocb_info = {
2469     .aiocb_size   = sizeof(NvmeCopyAIOCB),
2470     .cancel_async = nvme_copy_cancel,
2471 };
2472 
2473 static void nvme_copy_bh(void *opaque)
2474 {
2475     NvmeCopyAIOCB *iocb = opaque;
2476     NvmeRequest *req = iocb->req;
2477     NvmeNamespace *ns = req->ns;
2478     BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2479 
2480     if (iocb->idx != iocb->nr) {
2481         req->cqe.result = cpu_to_le32(iocb->idx);
2482     }
2483 
2484     qemu_iovec_destroy(&iocb->iov);
2485     g_free(iocb->bounce);
2486 
2487     qemu_bh_delete(iocb->bh);
2488     iocb->bh = NULL;
2489 
2490     if (iocb->ret < 0) {
2491         block_acct_failed(stats, &iocb->acct.read);
2492         block_acct_failed(stats, &iocb->acct.write);
2493     } else {
2494         block_acct_done(stats, &iocb->acct.read);
2495         block_acct_done(stats, &iocb->acct.write);
2496     }
2497 
2498     iocb->common.cb(iocb->common.opaque, iocb->ret);
2499     qemu_aio_unref(iocb);
2500 }
2501 
2502 static void nvme_copy_cb(void *opaque, int ret);
2503 
2504 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2505 {
2506     NvmeCopyAIOCB *iocb = opaque;
2507     NvmeRequest *req = iocb->req;
2508     NvmeNamespace *ns = req->ns;
2509     NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
2510     uint32_t nlb = le32_to_cpu(range->nlb) + 1;
2511 
2512     if (ret < 0) {
2513         iocb->ret = ret;
2514         goto out;
2515     } else if (iocb->ret < 0) {
2516         goto out;
2517     }
2518 
2519     if (ns->params.zoned) {
2520         nvme_advance_zone_wp(ns, iocb->zone, nlb);
2521     }
2522 
2523     iocb->idx++;
2524     iocb->slba += nlb;
2525 out:
2526     nvme_copy_cb(iocb, iocb->ret);
2527 }
2528 
2529 static void nvme_copy_out_cb(void *opaque, int ret)
2530 {
2531     NvmeCopyAIOCB *iocb = opaque;
2532     NvmeRequest *req = iocb->req;
2533     NvmeNamespace *ns = req->ns;
2534     NvmeCopySourceRange *range;
2535     uint32_t nlb;
2536     size_t mlen;
2537     uint8_t *mbounce;
2538 
2539     if (ret < 0) {
2540         iocb->ret = ret;
2541         goto out;
2542     } else if (iocb->ret < 0) {
2543         goto out;
2544     }
2545 
2546     if (!ns->lbaf.ms) {
2547         nvme_copy_out_completed_cb(iocb, 0);
2548         return;
2549     }
2550 
2551     range = &iocb->ranges[iocb->idx];
2552     nlb = le32_to_cpu(range->nlb) + 1;
2553 
2554     mlen = nvme_m2b(ns, nlb);
2555     mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2556 
2557     qemu_iovec_reset(&iocb->iov);
2558     qemu_iovec_add(&iocb->iov, mbounce, mlen);
2559 
2560     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2561                                   &iocb->iov, 0, nvme_copy_out_completed_cb,
2562                                   iocb);
2563 
2564     return;
2565 
2566 out:
2567     nvme_copy_cb(iocb, ret);
2568 }
2569 
2570 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2571 {
2572     NvmeCopyAIOCB *iocb = opaque;
2573     NvmeRequest *req = iocb->req;
2574     NvmeNamespace *ns = req->ns;
2575     NvmeCopySourceRange *range;
2576     uint32_t nlb;
2577     size_t len;
2578     uint16_t status;
2579 
2580     if (ret < 0) {
2581         iocb->ret = ret;
2582         goto out;
2583     } else if (iocb->ret < 0) {
2584         goto out;
2585     }
2586 
2587     range = &iocb->ranges[iocb->idx];
2588     nlb = le32_to_cpu(range->nlb) + 1;
2589     len = nvme_l2b(ns, nlb);
2590 
2591     trace_pci_nvme_copy_out(iocb->slba, nlb);
2592 
2593     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2594         NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2595 
2596         uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2597         uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2598 
2599         uint16_t apptag = le16_to_cpu(range->apptag);
2600         uint16_t appmask = le16_to_cpu(range->appmask);
2601         uint32_t reftag = le32_to_cpu(range->reftag);
2602 
2603         uint64_t slba = le64_to_cpu(range->slba);
2604         size_t mlen = nvme_m2b(ns, nlb);
2605         uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2606 
2607         status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2608                                 slba, apptag, appmask, &reftag);
2609         if (status) {
2610             goto invalid;
2611         }
2612 
2613         apptag = le16_to_cpu(copy->apptag);
2614         appmask = le16_to_cpu(copy->appmask);
2615 
2616         if (prinfow & NVME_PRINFO_PRACT) {
2617             status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2618             if (status) {
2619                 goto invalid;
2620             }
2621 
2622             nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2623                                         apptag, &iocb->reftag);
2624         } else {
2625             status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2626                                     prinfow, iocb->slba, apptag, appmask,
2627                                     &iocb->reftag);
2628             if (status) {
2629                 goto invalid;
2630             }
2631         }
2632     }
2633 
2634     status = nvme_check_bounds(ns, iocb->slba, nlb);
2635     if (status) {
2636         goto invalid;
2637     }
2638 
2639     if (ns->params.zoned) {
2640         status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2641         if (status) {
2642             goto invalid;
2643         }
2644 
2645         iocb->zone->w_ptr += nlb;
2646     }
2647 
2648     qemu_iovec_reset(&iocb->iov);
2649     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2650 
2651     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2652                                   &iocb->iov, 0, nvme_copy_out_cb, iocb);
2653 
2654     return;
2655 
2656 invalid:
2657     req->status = status;
2658     iocb->aiocb = NULL;
2659     if (iocb->bh) {
2660         qemu_bh_schedule(iocb->bh);
2661     }
2662 
2663     return;
2664 
2665 out:
2666     nvme_copy_cb(iocb, ret);
2667 }
2668 
2669 static void nvme_copy_in_cb(void *opaque, int ret)
2670 {
2671     NvmeCopyAIOCB *iocb = opaque;
2672     NvmeRequest *req = iocb->req;
2673     NvmeNamespace *ns = req->ns;
2674     NvmeCopySourceRange *range;
2675     uint64_t slba;
2676     uint32_t nlb;
2677 
2678     if (ret < 0) {
2679         iocb->ret = ret;
2680         goto out;
2681     } else if (iocb->ret < 0) {
2682         goto out;
2683     }
2684 
2685     if (!ns->lbaf.ms) {
2686         nvme_copy_in_completed_cb(iocb, 0);
2687         return;
2688     }
2689 
2690     range = &iocb->ranges[iocb->idx];
2691     slba = le64_to_cpu(range->slba);
2692     nlb = le32_to_cpu(range->nlb) + 1;
2693 
2694     qemu_iovec_reset(&iocb->iov);
2695     qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2696                    nvme_m2b(ns, nlb));
2697 
2698     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2699                                  &iocb->iov, 0, nvme_copy_in_completed_cb,
2700                                  iocb);
2701     return;
2702 
2703 out:
2704     nvme_copy_cb(iocb, iocb->ret);
2705 }
2706 
2707 static void nvme_copy_cb(void *opaque, int ret)
2708 {
2709     NvmeCopyAIOCB *iocb = opaque;
2710     NvmeRequest *req = iocb->req;
2711     NvmeNamespace *ns = req->ns;
2712     NvmeCopySourceRange *range;
2713     uint64_t slba;
2714     uint32_t nlb;
2715     size_t len;
2716     uint16_t status;
2717 
2718     if (ret < 0) {
2719         iocb->ret = ret;
2720         goto done;
2721     } else if (iocb->ret < 0) {
2722         goto done;
2723     }
2724 
2725     if (iocb->idx == iocb->nr) {
2726         goto done;
2727     }
2728 
2729     range = &iocb->ranges[iocb->idx];
2730     slba = le64_to_cpu(range->slba);
2731     nlb = le32_to_cpu(range->nlb) + 1;
2732     len = nvme_l2b(ns, nlb);
2733 
2734     trace_pci_nvme_copy_source_range(slba, nlb);
2735 
2736     if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2737         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2738         goto invalid;
2739     }
2740 
2741     status = nvme_check_bounds(ns, slba, nlb);
2742     if (status) {
2743         goto invalid;
2744     }
2745 
2746     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2747         status = nvme_check_dulbe(ns, slba, nlb);
2748         if (status) {
2749             goto invalid;
2750         }
2751     }
2752 
2753     if (ns->params.zoned) {
2754         status = nvme_check_zone_read(ns, slba, nlb);
2755         if (status) {
2756             goto invalid;
2757         }
2758     }
2759 
2760     qemu_iovec_reset(&iocb->iov);
2761     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2762 
2763     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2764                                  &iocb->iov, 0, nvme_copy_in_cb, iocb);
2765     return;
2766 
2767 invalid:
2768     req->status = status;
2769 done:
2770     iocb->aiocb = NULL;
2771     if (iocb->bh) {
2772         qemu_bh_schedule(iocb->bh);
2773     }
2774 }
2775 
2776 
2777 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2778 {
2779     NvmeNamespace *ns = req->ns;
2780     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2781     NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2782                                       nvme_misc_cb, req);
2783     uint16_t nr = copy->nr + 1;
2784     uint8_t format = copy->control[0] & 0xf;
2785     uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2786     uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2787 
2788     uint16_t status;
2789 
2790     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2791 
2792     iocb->ranges = NULL;
2793     iocb->zone = NULL;
2794 
2795     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2796         ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2797         status = NVME_INVALID_FIELD | NVME_DNR;
2798         goto invalid;
2799     }
2800 
2801     if (!(n->id_ctrl.ocfs & (1 << format))) {
2802         trace_pci_nvme_err_copy_invalid_format(format);
2803         status = NVME_INVALID_FIELD | NVME_DNR;
2804         goto invalid;
2805     }
2806 
2807     if (nr > ns->id_ns.msrc + 1) {
2808         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2809         goto invalid;
2810     }
2811 
2812     iocb->ranges = g_new(NvmeCopySourceRange, nr);
2813 
2814     status = nvme_h2c(n, (uint8_t *)iocb->ranges,
2815                       sizeof(NvmeCopySourceRange) * nr, req);
2816     if (status) {
2817         goto invalid;
2818     }
2819 
2820     iocb->slba = le64_to_cpu(copy->sdlba);
2821 
2822     if (ns->params.zoned) {
2823         iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
2824         if (!iocb->zone) {
2825             status = NVME_LBA_RANGE | NVME_DNR;
2826             goto invalid;
2827         }
2828 
2829         status = nvme_zrm_auto(n, ns, iocb->zone);
2830         if (status) {
2831             goto invalid;
2832         }
2833     }
2834 
2835     iocb->req = req;
2836     iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
2837     iocb->ret = 0;
2838     iocb->nr = nr;
2839     iocb->idx = 0;
2840     iocb->reftag = le32_to_cpu(copy->reftag);
2841     iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
2842                               ns->lbasz + ns->lbaf.ms);
2843 
2844     qemu_iovec_init(&iocb->iov, 1);
2845 
2846     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
2847                      BLOCK_ACCT_READ);
2848     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
2849                      BLOCK_ACCT_WRITE);
2850 
2851     req->aiocb = &iocb->common;
2852     nvme_copy_cb(iocb, 0);
2853 
2854     return NVME_NO_COMPLETE;
2855 
2856 invalid:
2857     g_free(iocb->ranges);
2858     qemu_aio_unref(iocb);
2859     return status;
2860 }
2861 
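/*
 * Compare reads the on-disk data into a bounce buffer and, in
 * nvme_compare_data_cb(), transfers the host buffer and memcmp()s the two;
 * if the namespace carries metadata, a second read and comparison of the
 * metadata region follows in nvme_compare_mdata_cb().
 */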
2862 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2863 {
2864     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2865     NvmeNamespace *ns = req->ns;
2866     BlockBackend *blk = ns->blkconf.blk;
2867     uint64_t slba = le64_to_cpu(rw->slba);
2868     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2869     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2870     size_t data_len = nvme_l2b(ns, nlb);
2871     size_t len = data_len;
2872     int64_t offset = nvme_l2b(ns, slba);
2873     struct nvme_compare_ctx *ctx = NULL;
2874     uint16_t status;
2875 
2876     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2877 
2878     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
2879         return NVME_INVALID_PROT_INFO | NVME_DNR;
2880     }
2881 
2882     if (nvme_ns_ext(ns)) {
2883         len += nvme_m2b(ns, nlb);
2884     }
2885 
2886     status = nvme_check_mdts(n, len);
2887     if (status) {
2888         return status;
2889     }
2890 
2891     status = nvme_check_bounds(ns, slba, nlb);
2892     if (status) {
2893         return status;
2894     }
2895 
2896     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2897         status = nvme_check_dulbe(ns, slba, nlb);
2898         if (status) {
2899             return status;
2900         }
2901     }
2902 
2903     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2904     if (status) {
2905         return status;
2906     }
2907 
2908     ctx = g_new(struct nvme_compare_ctx, 1);
2909     ctx->data.bounce = g_malloc(data_len);
2910 
2911     req->opaque = ctx;
2912 
2913     qemu_iovec_init(&ctx->data.iov, 1);
2914     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2915 
2916     block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2917                      BLOCK_ACCT_READ);
2918     req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2919                                 nvme_compare_data_cb, req);
2920 
2921     return NVME_NO_COMPLETE;
2922 }
2923 
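/*
 * Flush with NSID FFFFFFFFh is a broadcast flush: the bottom half walks all
 * attached namespaces and nvme_flush_ns_cb() issues one blk_aio_flush() per
 * namespace in turn. A non-broadcast flush targets the single namespace
 * resolved in nvme_flush().
 */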
2924 typedef struct NvmeFlushAIOCB {
2925     BlockAIOCB common;
2926     BlockAIOCB *aiocb;
2927     NvmeRequest *req;
2928     QEMUBH *bh;
2929     int ret;
2930 
2931     NvmeNamespace *ns;
2932     uint32_t nsid;
2933     bool broadcast;
2934 } NvmeFlushAIOCB;
2935 
2936 static void nvme_flush_cancel(BlockAIOCB *acb)
2937 {
2938     NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
2939 
2940     iocb->ret = -ECANCELED;
2941 
2942     if (iocb->aiocb) {
2943         blk_aio_cancel_async(iocb->aiocb);
2944     }
2945 }
2946 
2947 static const AIOCBInfo nvme_flush_aiocb_info = {
2948     .aiocb_size = sizeof(NvmeFlushAIOCB),
2949     .cancel_async = nvme_flush_cancel,
2950     .get_aio_context = nvme_get_aio_context,
2951 };
2952 
2953 static void nvme_flush_ns_cb(void *opaque, int ret)
2954 {
2955     NvmeFlushAIOCB *iocb = opaque;
2956     NvmeNamespace *ns = iocb->ns;
2957 
2958     if (ret < 0) {
2959         iocb->ret = ret;
2960         goto out;
2961     } else if (iocb->ret < 0) {
2962         goto out;
2963     }
2964 
2965     if (ns) {
2966         trace_pci_nvme_flush_ns(iocb->nsid);
2967 
2968         iocb->ns = NULL;
2969         iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
2970         return;
2971     }
2972 
2973 out:
2974     iocb->aiocb = NULL;
2975     qemu_bh_schedule(iocb->bh);
2976 }
2977 
2978 static void nvme_flush_bh(void *opaque)
2979 {
2980     NvmeFlushAIOCB *iocb = opaque;
2981     NvmeRequest *req = iocb->req;
2982     NvmeCtrl *n = nvme_ctrl(req);
2983     int i;
2984 
2985     if (iocb->ret < 0) {
2986         goto done;
2987     }
2988 
2989     if (iocb->broadcast) {
2990         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
2991             iocb->ns = nvme_ns(n, i);
2992             if (iocb->ns) {
2993                 iocb->nsid = i;
2994                 break;
2995             }
2996         }
2997     }
2998 
2999     if (!iocb->ns) {
3000         goto done;
3001     }
3002 
3003     nvme_flush_ns_cb(iocb, 0);
3004     return;
3005 
3006 done:
3007     qemu_bh_delete(iocb->bh);
3008     iocb->bh = NULL;
3009 
3010     iocb->common.cb(iocb->common.opaque, iocb->ret);
3011 
3012     qemu_aio_unref(iocb);
3013 
3014     return;
3015 }
3016 
3017 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3018 {
3019     NvmeFlushAIOCB *iocb;
3020     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3021     uint16_t status;
3022 
3023     iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3024 
3025     iocb->req = req;
3026     iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3027     iocb->ret = 0;
3028     iocb->ns = NULL;
3029     iocb->nsid = 0;
3030     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3031 
3032     if (!iocb->broadcast) {
3033         if (!nvme_nsid_valid(n, nsid)) {
3034             status = NVME_INVALID_NSID | NVME_DNR;
3035             goto out;
3036         }
3037 
3038         iocb->ns = nvme_ns(n, nsid);
3039         if (!iocb->ns) {
3040             status = NVME_INVALID_FIELD | NVME_DNR;
3041             goto out;
3042         }
3043 
3044         iocb->nsid = nsid;
3045     }
3046 
3047     req->aiocb = &iocb->common;
3048     qemu_bh_schedule(iocb->bh);
3049 
3050     return NVME_NO_COMPLETE;
3051 
3052 out:
3053     qemu_bh_delete(iocb->bh);
3054     iocb->bh = NULL;
3055     qemu_aio_unref(iocb);
3056 
3057     return status;
3058 }
3059 
3060 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3061 {
3062     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3063     NvmeNamespace *ns = req->ns;
3064     uint64_t slba = le64_to_cpu(rw->slba);
3065     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3066     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3067     uint64_t data_size = nvme_l2b(ns, nlb);
3068     uint64_t mapped_size = data_size;
3069     uint64_t data_offset;
3070     BlockBackend *blk = ns->blkconf.blk;
3071     uint16_t status;
3072 
3073     if (nvme_ns_ext(ns)) {
3074         mapped_size += nvme_m2b(ns, nlb);
3075 
3076         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3077             bool pract = prinfo & NVME_PRINFO_PRACT;
3078 
3079             if (pract && ns->lbaf.ms == 8) {
3080                 mapped_size = data_size;
3081             }
3082         }
3083     }
3084 
3085     trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3086 
3087     status = nvme_check_mdts(n, mapped_size);
3088     if (status) {
3089         goto invalid;
3090     }
3091 
3092     status = nvme_check_bounds(ns, slba, nlb);
3093     if (status) {
3094         goto invalid;
3095     }
3096 
3097     if (ns->params.zoned) {
3098         status = nvme_check_zone_read(ns, slba, nlb);
3099         if (status) {
3100             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3101             goto invalid;
3102         }
3103     }
3104 
3105     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3106         status = nvme_check_dulbe(ns, slba, nlb);
3107         if (status) {
3108             goto invalid;
3109         }
3110     }
3111 
3112     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3113         return nvme_dif_rw(n, req);
3114     }
3115 
3116     status = nvme_map_data(n, nlb, req);
3117     if (status) {
3118         goto invalid;
3119     }
3120 
3121     data_offset = nvme_l2b(ns, slba);
3122 
3123     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3124                      BLOCK_ACCT_READ);
3125     nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3126     return NVME_NO_COMPLETE;
3127 
3128 invalid:
3129     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3130     return status | NVME_DNR;
3131 }
3132 
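/*
 * Common implementation for Write, Write Zeroes (wrz) and Zone Append
 * (append). For zoned namespaces the target zone is validated and implicitly
 * opened; Zone Append additionally requires SLBA == ZSLBA, enforces ZASL,
 * rewrites the effective SLBA to the current write pointer and, depending on
 * the protection type, remaps the reference tag (PIREMAP).
 */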
3133 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3134                               bool wrz)
3135 {
3136     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3137     NvmeNamespace *ns = req->ns;
3138     uint64_t slba = le64_to_cpu(rw->slba);
3139     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3140     uint16_t ctrl = le16_to_cpu(rw->control);
3141     uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3142     uint64_t data_size = nvme_l2b(ns, nlb);
3143     uint64_t mapped_size = data_size;
3144     uint64_t data_offset;
3145     NvmeZone *zone;
3146     NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3147     BlockBackend *blk = ns->blkconf.blk;
3148     uint16_t status;
3149 
3150     if (nvme_ns_ext(ns)) {
3151         mapped_size += nvme_m2b(ns, nlb);
3152 
3153         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3154             bool pract = prinfo & NVME_PRINFO_PRACT;
3155 
3156             if (pract && ns->lbaf.ms == 8) {
3157                 mapped_size -= nvme_m2b(ns, nlb);
3158             }
3159         }
3160     }
3161 
3162     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3163                          nvme_nsid(ns), nlb, mapped_size, slba);
3164 
3165     if (!wrz) {
3166         status = nvme_check_mdts(n, mapped_size);
3167         if (status) {
3168             goto invalid;
3169         }
3170     }
3171 
3172     status = nvme_check_bounds(ns, slba, nlb);
3173     if (status) {
3174         goto invalid;
3175     }
3176 
3177     if (ns->params.zoned) {
3178         zone = nvme_get_zone_by_slba(ns, slba);
3179         assert(zone);
3180 
3181         if (append) {
3182             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3183 
3184             if (unlikely(slba != zone->d.zslba)) {
3185                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3186                 status = NVME_INVALID_FIELD;
3187                 goto invalid;
3188             }
3189 
3190             if (n->params.zasl &&
3191                 data_size > (uint64_t)n->page_size << n->params.zasl) {
3192                 trace_pci_nvme_err_zasl(data_size);
3193                 return NVME_INVALID_FIELD | NVME_DNR;
3194             }
3195 
3196             slba = zone->w_ptr;
3197             rw->slba = cpu_to_le64(slba);
3198             res->slba = cpu_to_le64(slba);
3199 
3200             switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3201             case NVME_ID_NS_DPS_TYPE_1:
3202                 if (!piremap) {
3203                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3204                 }
3205 
3206                 /* fallthrough */
3207 
3208             case NVME_ID_NS_DPS_TYPE_2:
3209                 if (piremap) {
3210                     uint32_t reftag = le32_to_cpu(rw->reftag);
3211                     rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3212                 }
3213 
3214                 break;
3215 
3216             case NVME_ID_NS_DPS_TYPE_3:
3217                 if (piremap) {
3218                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3219                 }
3220 
3221                 break;
3222             }
3223         }
3224 
3225         status = nvme_check_zone_write(ns, zone, slba, nlb);
3226         if (status) {
3227             goto invalid;
3228         }
3229 
3230         status = nvme_zrm_auto(n, ns, zone);
3231         if (status) {
3232             goto invalid;
3233         }
3234 
3235         zone->w_ptr += nlb;
3236     }
3237 
3238     data_offset = nvme_l2b(ns, slba);
3239 
3240     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3241         return nvme_dif_rw(n, req);
3242     }
3243 
3244     if (!wrz) {
3245         status = nvme_map_data(n, nlb, req);
3246         if (status) {
3247             goto invalid;
3248         }
3249 
3250         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3251                          BLOCK_ACCT_WRITE);
3252         nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3253     } else {
3254         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3255                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3256                                            req);
3257     }
3258 
3259     return NVME_NO_COMPLETE;
3260 
3261 invalid:
3262     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3263     return status | NVME_DNR;
3264 }
3265 
3266 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3267 {
3268     return nvme_do_write(n, req, false, false);
3269 }
3270 
3271 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3272 {
3273     return nvme_do_write(n, req, false, true);
3274 }
3275 
3276 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3277 {
3278     return nvme_do_write(n, req, true, false);
3279 }
3280 
3281 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3282                                             uint64_t *slba, uint32_t *zone_idx)
3283 {
3284     uint32_t dw10 = le32_to_cpu(c->cdw10);
3285     uint32_t dw11 = le32_to_cpu(c->cdw11);
3286 
3287     if (!ns->params.zoned) {
3288         trace_pci_nvme_err_invalid_opc(c->opcode);
3289         return NVME_INVALID_OPCODE | NVME_DNR;
3290     }
3291 
3292     *slba = ((uint64_t)dw11) << 32 | dw10;
3293     if (unlikely(*slba >= ns->id_ns.nsze)) {
3294         trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3295         *slba = 0;
3296         return NVME_LBA_RANGE | NVME_DNR;
3297     }
3298 
3299     *zone_idx = nvme_zone_idx(ns, *slba);
3300     assert(*zone_idx < ns->num_zones);
3301 
3302     return NVME_SUCCESS;
3303 }
3304 
3305 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3306                                  NvmeRequest *);
3307 
3308 enum NvmeZoneProcessingMask {
3309     NVME_PROC_CURRENT_ZONE    = 0,
3310     NVME_PROC_OPENED_ZONES    = 1 << 0,
3311     NVME_PROC_CLOSED_ZONES    = 1 << 1,
3312     NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3313     NVME_PROC_FULL_ZONES      = 1 << 3,
3314 };
3315 
3316 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3317                                NvmeZoneState state, NvmeRequest *req)
3318 {
3319     return nvme_zrm_open(nvme_ctrl(req), ns, zone);
3320 }
3321 
3322 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3323                                 NvmeZoneState state, NvmeRequest *req)
3324 {
3325     return nvme_zrm_close(ns, zone);
3326 }
3327 
3328 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3329                                  NvmeZoneState state, NvmeRequest *req)
3330 {
3331     return nvme_zrm_finish(ns, zone);
3332 }
3333 
3334 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3335                                   NvmeZoneState state, NvmeRequest *req)
3336 {
3337     switch (state) {
3338     case NVME_ZONE_STATE_READ_ONLY:
3339         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3340         /* fall through */
3341     case NVME_ZONE_STATE_OFFLINE:
3342         return NVME_SUCCESS;
3343     default:
3344         return NVME_ZONE_INVAL_TRANSITION;
3345     }
3346 }
3347 
3348 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3349 {
3350     uint16_t status;
3351     uint8_t state = nvme_get_zone_state(zone);
3352 
3353     if (state == NVME_ZONE_STATE_EMPTY) {
3354         status = nvme_aor_check(ns, 1, 0);
3355         if (status) {
3356             return status;
3357         }
3358         nvme_aor_inc_active(ns);
3359         zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3360         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3361         return NVME_SUCCESS;
3362     }
3363 
3364     return NVME_ZONE_INVAL_TRANSITION;
3365 }
3366 
3367 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3368                                     enum NvmeZoneProcessingMask proc_mask,
3369                                     op_handler_t op_hndlr, NvmeRequest *req)
3370 {
3371     uint16_t status = NVME_SUCCESS;
3372     NvmeZoneState zs = nvme_get_zone_state(zone);
3373     bool proc_zone;
3374 
3375     switch (zs) {
3376     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3377     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3378         proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3379         break;
3380     case NVME_ZONE_STATE_CLOSED:
3381         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3382         break;
3383     case NVME_ZONE_STATE_READ_ONLY:
3384         proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3385         break;
3386     case NVME_ZONE_STATE_FULL:
3387         proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3388         break;
3389     default:
3390         proc_zone = false;
3391     }
3392 
3393     if (proc_zone) {
3394         status = op_hndlr(ns, zone, zs, req);
3395     }
3396 
3397     return status;
3398 }
3399 
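/*
 * Apply a zone management operation either to the single zone given (empty
 * proc_mask) or, for "Select All" requests, to every zone whose current
 * state is selected by proc_mask; the closed, open and full zone lists are
 * walked first, and read-only zones are handled by scanning the whole zone
 * array.
 */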
3400 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3401                                 enum NvmeZoneProcessingMask proc_mask,
3402                                 op_handler_t op_hndlr, NvmeRequest *req)
3403 {
3404     NvmeZone *next;
3405     uint16_t status = NVME_SUCCESS;
3406     int i;
3407 
3408     if (!proc_mask) {
3409         status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3410     } else {
3411         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3412             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3413                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3414                                              req);
3415                 if (status && status != NVME_NO_COMPLETE) {
3416                     goto out;
3417                 }
3418             }
3419         }
3420         if (proc_mask & NVME_PROC_OPENED_ZONES) {
3421             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3422                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3423                                              req);
3424                 if (status && status != NVME_NO_COMPLETE) {
3425                     goto out;
3426                 }
3427             }
3428 
3429             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3430                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3431                                              req);
3432                 if (status && status != NVME_NO_COMPLETE) {
3433                     goto out;
3434                 }
3435             }
3436         }
3437         if (proc_mask & NVME_PROC_FULL_ZONES) {
3438             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3439                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3440                                              req);
3441                 if (status && status != NVME_NO_COMPLETE) {
3442                     goto out;
3443                 }
3444             }
3445         }
3446 
3447         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3448             for (i = 0; i < ns->num_zones; i++, zone++) {
3449                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3450                                              req);
3451                 if (status && status != NVME_NO_COMPLETE) {
3452                     goto out;
3453                 }
3454             }
3455         }
3456     }
3457 
3458 out:
3459     return status;
3460 }
3461 
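/*
 * Zone Reset (and Reset All) is another callback-driven chain:
 * nvme_zone_reset_cb() picks the next zone to reset and zeroes its data;
 * if the namespace has metadata, nvme_zone_reset_epilogue_cb() zeroes the
 * metadata region as well before the state transition in nvme_zrm_reset()
 * is applied on the following iteration.
 */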
3462 typedef struct NvmeZoneResetAIOCB {
3463     BlockAIOCB common;
3464     BlockAIOCB *aiocb;
3465     NvmeRequest *req;
3466     QEMUBH *bh;
3467     int ret;
3468 
3469     bool all;
3470     int idx;
3471     NvmeZone *zone;
3472 } NvmeZoneResetAIOCB;
3473 
3474 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3475 {
3476     NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3477     NvmeRequest *req = iocb->req;
3478     NvmeNamespace *ns = req->ns;
3479 
3480     iocb->idx = ns->num_zones;
3481 
3482     iocb->ret = -ECANCELED;
3483 
3484     if (iocb->aiocb) {
3485         blk_aio_cancel_async(iocb->aiocb);
3486         iocb->aiocb = NULL;
3487     }
3488 }
3489 
3490 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3491     .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3492     .cancel_async = nvme_zone_reset_cancel,
3493 };
3494 
3495 static void nvme_zone_reset_bh(void *opaque)
3496 {
3497     NvmeZoneResetAIOCB *iocb = opaque;
3498 
3499     iocb->common.cb(iocb->common.opaque, iocb->ret);
3500 
3501     qemu_bh_delete(iocb->bh);
3502     iocb->bh = NULL;
3503     qemu_aio_unref(iocb);
3504 }
3505 
3506 static void nvme_zone_reset_cb(void *opaque, int ret);
3507 
3508 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3509 {
3510     NvmeZoneResetAIOCB *iocb = opaque;
3511     NvmeRequest *req = iocb->req;
3512     NvmeNamespace *ns = req->ns;
3513     int64_t moff;
3514     int count;
3515 
3516     if (ret < 0) {
3517         nvme_zone_reset_cb(iocb, ret);
3518         return;
3519     }
3520 
3521     if (!ns->lbaf.ms) {
3522         nvme_zone_reset_cb(iocb, 0);
3523         return;
3524     }
3525 
3526     moff = nvme_moff(ns, iocb->zone->d.zslba);
3527     count = nvme_m2b(ns, ns->zone_size);
3528 
3529     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3530                                         BDRV_REQ_MAY_UNMAP,
3531                                         nvme_zone_reset_cb, iocb);
3532     return;
3533 }
3534 
3535 static void nvme_zone_reset_cb(void *opaque, int ret)
3536 {
3537     NvmeZoneResetAIOCB *iocb = opaque;
3538     NvmeRequest *req = iocb->req;
3539     NvmeNamespace *ns = req->ns;
3540 
3541     if (ret < 0) {
3542         iocb->ret = ret;
3543         goto done;
3544     }
3545 
3546     if (iocb->zone) {
3547         nvme_zrm_reset(ns, iocb->zone);
3548 
3549         if (!iocb->all) {
3550             goto done;
3551         }
3552     }
3553 
3554     while (iocb->idx < ns->num_zones) {
3555         NvmeZone *zone = &ns->zone_array[iocb->idx++];
3556 
3557         switch (nvme_get_zone_state(zone)) {
3558         case NVME_ZONE_STATE_EMPTY:
3559             if (!iocb->all) {
3560                 goto done;
3561             }
3562 
3563             continue;
3564 
3565         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3566         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3567         case NVME_ZONE_STATE_CLOSED:
3568         case NVME_ZONE_STATE_FULL:
3569             iocb->zone = zone;
3570             break;
3571 
3572         default:
3573             continue;
3574         }
3575 
3576         trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3577 
3578         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3579                                             nvme_l2b(ns, zone->d.zslba),
3580                                             nvme_l2b(ns, ns->zone_size),
3581                                             BDRV_REQ_MAY_UNMAP,
3582                                             nvme_zone_reset_epilogue_cb,
3583                                             iocb);
3584         return;
3585     }
3586 
3587 done:
3588     iocb->aiocb = NULL;
3589     if (iocb->bh) {
3590         qemu_bh_schedule(iocb->bh);
3591     }
3592 }
3593 
3594 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3595 {
3596     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3597     NvmeNamespace *ns = req->ns;
3598     NvmeZone *zone;
3599     NvmeZoneResetAIOCB *iocb;
3600     uint8_t *zd_ext;
3601     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3602     uint64_t slba = 0;
3603     uint32_t zone_idx = 0;
3604     uint16_t status;
3605     uint8_t action;
3606     bool all;
3607     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3608 
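    /* cdw13[7:0] holds the Zone Send Action, cdw13[8] the Select All flag */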
3609     action = dw13 & 0xff;
3610     all = !!(dw13 & 0x100);
3611 
3612     req->status = NVME_SUCCESS;
3613 
3614     if (!all) {
3615         status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3616         if (status) {
3617             return status;
3618         }
3619     }
3620 
3621     zone = &ns->zone_array[zone_idx];
3622     if (slba != zone->d.zslba) {
3623         trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3624         return NVME_INVALID_FIELD | NVME_DNR;
3625     }
3626 
3627     switch (action) {
3628 
3629     case NVME_ZONE_ACTION_OPEN:
3630         if (all) {
3631             proc_mask = NVME_PROC_CLOSED_ZONES;
3632         }
3633         trace_pci_nvme_open_zone(slba, zone_idx, all);
3634         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3635         break;
3636 
3637     case NVME_ZONE_ACTION_CLOSE:
3638         if (all) {
3639             proc_mask = NVME_PROC_OPENED_ZONES;
3640         }
3641         trace_pci_nvme_close_zone(slba, zone_idx, all);
3642         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3643         break;
3644 
3645     case NVME_ZONE_ACTION_FINISH:
3646         if (all) {
3647             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3648         }
3649         trace_pci_nvme_finish_zone(slba, zone_idx, all);
3650         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3651         break;
3652 
3653     case NVME_ZONE_ACTION_RESET:
3654         trace_pci_nvme_reset_zone(slba, zone_idx, all);
3655 
3656         iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3657                            nvme_misc_cb, req);
3658 
3659         iocb->req = req;
3660         iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3661         iocb->ret = 0;
3662         iocb->all = all;
3663         iocb->idx = zone_idx;
3664         iocb->zone = NULL;
3665 
3666         req->aiocb = &iocb->common;
3667         nvme_zone_reset_cb(iocb, 0);
3668 
3669         return NVME_NO_COMPLETE;
3670 
3671     case NVME_ZONE_ACTION_OFFLINE:
3672         if (all) {
3673             proc_mask = NVME_PROC_READ_ONLY_ZONES;
3674         }
3675         trace_pci_nvme_offline_zone(slba, zone_idx, all);
3676         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3677         break;
3678 
3679     case NVME_ZONE_ACTION_SET_ZD_EXT:
3680         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3681         if (all || !ns->params.zd_extension_size) {
3682             return NVME_INVALID_FIELD | NVME_DNR;
3683         }
3684         zd_ext = nvme_get_zd_extension(ns, zone_idx);
3685         status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3686         if (status) {
3687             trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3688             return status;
3689         }
3690 
3691         status = nvme_set_zd_ext(ns, zone);
3692         if (status == NVME_SUCCESS) {
3693             trace_pci_nvme_zd_extension_set(zone_idx);
3694             return status;
3695         }
3696         break;
3697 
3698     default:
3699         trace_pci_nvme_err_invalid_mgmt_action(action);
3700         status = NVME_INVALID_FIELD;
3701     }
3702 
3703     if (status == NVME_ZONE_INVAL_TRANSITION) {
3704         trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3705                                                          zone->d.za);
3706     }
3707     if (status) {
3708         status |= NVME_DNR;
3709     }
3710 
3711     return status;
3712 }
3713 
3714 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3715 {
3716     NvmeZoneState zs = nvme_get_zone_state(zl);
3717 
3718     switch (zafs) {
3719     case NVME_ZONE_REPORT_ALL:
3720         return true;
3721     case NVME_ZONE_REPORT_EMPTY:
3722         return zs == NVME_ZONE_STATE_EMPTY;
3723     case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3724         return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3725     case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3726         return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3727     case NVME_ZONE_REPORT_CLOSED:
3728         return zs == NVME_ZONE_STATE_CLOSED;
3729     case NVME_ZONE_REPORT_FULL:
3730         return zs == NVME_ZONE_STATE_FULL;
3731     case NVME_ZONE_REPORT_READ_ONLY:
3732         return zs == NVME_ZONE_STATE_READ_ONLY;
3733     case NVME_ZONE_REPORT_OFFLINE:
3734         return zs == NVME_ZONE_STATE_OFFLINE;
3735     default:
3736         return false;
3737     }
3738 }
3739 
3740 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3741 {
3742     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3743     NvmeNamespace *ns = req->ns;
3744     /* cdw12 is the zero-based number of dwords to return; convert to bytes */
3745     uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
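    /* e.g. cdw12 = 0 requests (0 + 1) << 2 = 4 bytes */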
3746     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3747     uint32_t zone_idx, zra, zrasf, partial;
3748     uint64_t max_zones, nr_zones = 0;
3749     uint16_t status;
3750     uint64_t slba;
3751     NvmeZoneDescr *z;
3752     NvmeZone *zone;
3753     NvmeZoneReportHeader *header;
3754     void *buf, *buf_p;
3755     size_t zone_entry_sz;
3756     int i;
3757 
3758     req->status = NVME_SUCCESS;
3759 
3760     status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3761     if (status) {
3762         return status;
3763     }
3764 
3765     zra = dw13 & 0xff;
3766     if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3767         return NVME_INVALID_FIELD | NVME_DNR;
3768     }
3769     if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3770         return NVME_INVALID_FIELD | NVME_DNR;
3771     }
3772 
3773     zrasf = (dw13 >> 8) & 0xff;
3774     if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3775         return NVME_INVALID_FIELD | NVME_DNR;
3776     }
3777 
3778     if (data_size < sizeof(NvmeZoneReportHeader)) {
3779         return NVME_INVALID_FIELD | NVME_DNR;
3780     }
3781 
3782     status = nvme_check_mdts(n, data_size);
3783     if (status) {
3784         return status;
3785     }
3786 
3787     partial = (dw13 >> 16) & 0x01;
3788 
3789     zone_entry_sz = sizeof(NvmeZoneDescr);
3790     if (zra == NVME_ZONE_REPORT_EXTENDED) {
3791         zone_entry_sz += ns->params.zd_extension_size;
3792     }
3793 
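    /*
     * The report is a header holding the zone count, followed by one
     * descriptor per reported zone; for an extended report each descriptor
     * is followed by the zone descriptor extension.  max_zones is the number
     * of such entries that fit in the host-supplied buffer.
     */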
3794     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3795     buf = g_malloc0(data_size);
3796 
3797     zone = &ns->zone_array[zone_idx];
3798     for (i = zone_idx; i < ns->num_zones; i++) {
3799         if (partial && nr_zones >= max_zones) {
3800             break;
3801         }
3802         if (nvme_zone_matches_filter(zrasf, zone++)) {
3803             nr_zones++;
3804         }
3805     }
3806     header = (NvmeZoneReportHeader *)buf;
3807     header->nr_zones = cpu_to_le64(nr_zones);
3808 
3809     buf_p = buf + sizeof(NvmeZoneReportHeader);
3810     for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3811         zone = &ns->zone_array[zone_idx];
3812         if (nvme_zone_matches_filter(zrasf, zone)) {
3813             z = (NvmeZoneDescr *)buf_p;
3814             buf_p += sizeof(NvmeZoneDescr);
3815 
3816             z->zt = zone->d.zt;
3817             z->zs = zone->d.zs;
3818             z->zcap = cpu_to_le64(zone->d.zcap);
3819             z->zslba = cpu_to_le64(zone->d.zslba);
3820             z->za = zone->d.za;
3821 
3822             if (nvme_wp_is_valid(zone)) {
3823                 z->wp = cpu_to_le64(zone->d.wp);
3824             } else {
3825                 z->wp = cpu_to_le64(~0ULL);
3826             }
3827 
3828             if (zra == NVME_ZONE_REPORT_EXTENDED) {
3829                 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3830                     memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3831                            ns->params.zd_extension_size);
3832                 }
3833                 buf_p += ns->params.zd_extension_size;
3834             }
3835 
3836             max_zones--;
3837         }
3838     }
3839 
3840     status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3841 
3842     g_free(buf);
3843 
3844     return status;
3845 }
3846 
3847 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3848 {
3849     NvmeNamespace *ns;
3850     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3851 
3852     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3853                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3854 
3855     if (!nvme_nsid_valid(n, nsid)) {
3856         return NVME_INVALID_NSID | NVME_DNR;
3857     }
3858 
3859      /*
3860       * In the base NVM command set, Flush may apply to all namespaces
3861       * (indicated by NSID being set to FFFFFFFFh). But when that feature is
3862       * used together with TP 4056 (Namespace Types), it becomes ambiguous.
3863       *
3864       * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
3865       * opcode with a specific command since we cannot determine a unique I/O
3866       * command set. Opcode 0h could mean something entirely different from
3867       * Flush, with completely different semantics, in some other command
3868       * set - does an NSID of FFFFFFFFh then mean "for all namespaces, apply
3869       * whatever command set specific command uses the 0h opcode"? Or does
3870       * it mean "for all namespaces, apply whatever command uses the 0h
3871       * opcode if, and only if, that command set allows NSID to be
3872       * FFFFFFFFh"?
3873       *
3874       * Luckily, for now, we do not have to care about this, since the
3875       * device only supports command sets that include the NVM Flush command
3876       * (NVM and Zoned), so we always do an NVM Flush.
3877       */
3878     if (req->cmd.opcode == NVME_CMD_FLUSH) {
3879         return nvme_flush(n, req);
3880     }
3881 
3882     ns = nvme_ns(n, nsid);
3883     if (unlikely(!ns)) {
3884         return NVME_INVALID_FIELD | NVME_DNR;
3885     }
3886 
3887     if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3888         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3889         return NVME_INVALID_OPCODE | NVME_DNR;
3890     }
3891 
3892     if (ns->status) {
3893         return ns->status;
3894     }
3895 
3896     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
3897         return NVME_INVALID_FIELD;
3898     }
3899 
3900     req->ns = ns;
3901 
3902     switch (req->cmd.opcode) {
3903     case NVME_CMD_WRITE_ZEROES:
3904         return nvme_write_zeroes(n, req);
3905     case NVME_CMD_ZONE_APPEND:
3906         return nvme_zone_append(n, req);
3907     case NVME_CMD_WRITE:
3908         return nvme_write(n, req);
3909     case NVME_CMD_READ:
3910         return nvme_read(n, req);
3911     case NVME_CMD_COMPARE:
3912         return nvme_compare(n, req);
3913     case NVME_CMD_DSM:
3914         return nvme_dsm(n, req);
3915     case NVME_CMD_VERIFY:
3916         return nvme_verify(n, req);
3917     case NVME_CMD_COPY:
3918         return nvme_copy(n, req);
3919     case NVME_CMD_ZONE_MGMT_SEND:
3920         return nvme_zone_mgmt_send(n, req);
3921     case NVME_CMD_ZONE_MGMT_RECV:
3922         return nvme_zone_mgmt_recv(n, req);
3923     default:
3924         assert(false);
3925     }
3926 
3927     return NVME_INVALID_OPCODE | NVME_DNR;
3928 }
3929 
3930 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3931 {
3932     n->sq[sq->sqid] = NULL;
3933     timer_free(sq->timer);
3934     g_free(sq->io_req);
3935     if (sq->sqid) {
3936         g_free(sq);
3937     }
3938 }
3939 
3940 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3941 {
3942     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3943     NvmeRequest *r, *next;
3944     NvmeSQueue *sq;
3945     NvmeCQueue *cq;
3946     uint16_t qid = le16_to_cpu(c->qid);
3947 
3948     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3949         trace_pci_nvme_err_invalid_del_sq(qid);
3950         return NVME_INVALID_QID | NVME_DNR;
3951     }
3952 
3953     trace_pci_nvme_del_sq(qid);
3954 
3955     sq = n->sq[qid];
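    /*
     * Cancel outstanding requests synchronously; each cancelled request is
     * completed and taken off out_req_list, so the loop terminates once all
     * of them have finished.
     */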
3956     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3957         r = QTAILQ_FIRST(&sq->out_req_list);
3958         assert(r->aiocb);
3959         blk_aio_cancel(r->aiocb);
3960     }
3961 
3962     assert(QTAILQ_EMPTY(&sq->out_req_list));
3963 
3964     if (!nvme_check_cqid(n, sq->cqid)) {
3965         cq = n->cq[sq->cqid];
3966         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3967 
3968         nvme_post_cqes(cq);
3969         QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3970             if (r->sq == sq) {
3971                 QTAILQ_REMOVE(&cq->req_list, r, entry);
3972                 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3973             }
3974         }
3975     }
3976 
3977     nvme_free_sq(sq, n);
3978     return NVME_SUCCESS;
3979 }
3980 
3981 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3982                          uint16_t sqid, uint16_t cqid, uint16_t size)
3983 {
3984     int i;
3985     NvmeCQueue *cq;
3986 
3987     sq->ctrl = n;
3988     sq->dma_addr = dma_addr;
3989     sq->sqid = sqid;
3990     sq->size = size;
3991     sq->cqid = cqid;
3992     sq->head = sq->tail = 0;
3993     sq->io_req = g_new0(NvmeRequest, sq->size);
3994 
3995     QTAILQ_INIT(&sq->req_list);
3996     QTAILQ_INIT(&sq->out_req_list);
3997     for (i = 0; i < sq->size; i++) {
3998         sq->io_req[i].sq = sq;
3999         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4000     }
4001     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
4002 
4003     assert(n->cq[cqid]);
4004     cq = n->cq[cqid];
4005     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4006     n->sq[sqid] = sq;
4007 }
4008 
4009 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4010 {
4011     NvmeSQueue *sq;
4012     NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4013 
4014     uint16_t cqid = le16_to_cpu(c->cqid);
4015     uint16_t sqid = le16_to_cpu(c->sqid);
4016     uint16_t qsize = le16_to_cpu(c->qsize);
4017     uint16_t qflags = le16_to_cpu(c->sq_flags);
4018     uint64_t prp1 = le64_to_cpu(c->prp1);
4019 
4020     trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4021 
4022     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4023         trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4024         return NVME_INVALID_CQID | NVME_DNR;
4025     }
4026     if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4027         n->sq[sqid] != NULL)) {
4028         trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4029         return NVME_INVALID_QID | NVME_DNR;
4030     }
4031     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4032         trace_pci_nvme_err_invalid_create_sq_size(qsize);
4033         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4034     }
4035     if (unlikely(prp1 & (n->page_size - 1))) {
4036         trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4037         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4038     }
4039     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4040         trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4041         return NVME_INVALID_FIELD | NVME_DNR;
4042     }
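    /* qsize is a 0's based value (bounded by CAP.MQES above), hence qsize + 1 */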
4043     sq = g_malloc0(sizeof(*sq));
4044     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4045     return NVME_SUCCESS;
4046 }
4047 
4048 struct nvme_stats {
4049     uint64_t units_read;
4050     uint64_t units_written;
4051     uint64_t read_commands;
4052     uint64_t write_commands;
4053 };
4054 
4055 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4056 {
4057     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4058 
4059     stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4060     stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4061     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4062     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4063 }
4064 
4065 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4066                                 uint64_t off, NvmeRequest *req)
4067 {
4068     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4069     struct nvme_stats stats = { 0 };
4070     NvmeSmartLog smart = { 0 };
4071     uint32_t trans_len;
4072     NvmeNamespace *ns;
4073     time_t current_ms;
4074 
4075     if (off >= sizeof(smart)) {
4076         return NVME_INVALID_FIELD | NVME_DNR;
4077     }
4078 
4079     if (nsid != 0xffffffff) {
4080         ns = nvme_ns(n, nsid);
4081         if (!ns) {
4082             return NVME_INVALID_NSID | NVME_DNR;
4083         }
4084         nvme_set_blk_stats(ns, &stats);
4085     } else {
4086         int i;
4087 
4088         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4089             ns = nvme_ns(n, i);
4090             if (!ns) {
4091                 continue;
4092             }
4093             nvme_set_blk_stats(ns, &stats);
4094         }
4095     }
4096 
4097     trans_len = MIN(sizeof(smart) - off, buf_len);
4098     smart.critical_warning = n->smart_critical_warning;
4099 
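    /*
     * Data Units Read/Written are reported in units of 1000 512-byte blocks,
     * rounded up (hence the DIV_ROUND_UP by 1000 on the sector counts).
     */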
4100     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4101                                                         1000));
4102     smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4103                                                            1000));
4104     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4105     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4106 
4107     smart.temperature = cpu_to_le16(n->temperature);
4108 
4109     if ((n->temperature >= n->features.temp_thresh_hi) ||
4110         (n->temperature <= n->features.temp_thresh_low)) {
4111         smart.critical_warning |= NVME_SMART_TEMPERATURE;
4112     }
4113 
4114     current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4115     smart.power_on_hours[0] =
4116         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4117 
4118     if (!rae) {
4119         nvme_clear_events(n, NVME_AER_TYPE_SMART);
4120     }
4121 
4122     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4123 }
4124 
4125 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4126                                  NvmeRequest *req)
4127 {
4128     uint32_t trans_len;
4129     NvmeFwSlotInfoLog fw_log = {
4130         .afi = 0x1,
4131     };
4132 
4133     if (off >= sizeof(fw_log)) {
4134         return NVME_INVALID_FIELD | NVME_DNR;
4135     }
4136 
4137     strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4138     trans_len = MIN(sizeof(fw_log) - off, buf_len);
4139 
4140     return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4141 }
4142 
4143 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4144                                 uint64_t off, NvmeRequest *req)
4145 {
4146     uint32_t trans_len;
4147     NvmeErrorLog errlog;
4148 
4149     if (off >= sizeof(errlog)) {
4150         return NVME_INVALID_FIELD | NVME_DNR;
4151     }
4152 
4153     if (!rae) {
4154         nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4155     }
4156 
4157     memset(&errlog, 0x0, sizeof(errlog));
4158     trans_len = MIN(sizeof(errlog) - off, buf_len);
4159 
4160     return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4161 }
4162 
4163 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4164                                     uint64_t off, NvmeRequest *req)
4165 {
4166     uint32_t nslist[1024];
4167     uint32_t trans_len;
4168     int i = 0;
4169     uint32_t nsid;
4170 
4171     memset(nslist, 0x0, sizeof(nslist));
4172     trans_len = MIN(sizeof(nslist) - off, buf_len);
4173 
4174     while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4175             NVME_CHANGED_NSID_SIZE) {
4176          /*
4177           * If more than 1024 namespaces have changed, the first entry in the
4178           * log page is set to FFFFFFFFh and the rest to 0, as the spec requires.
4179           */
4180         if (i == ARRAY_SIZE(nslist)) {
4181             memset(nslist, 0x0, sizeof(nslist));
4182             nslist[0] = 0xffffffff;
4183             break;
4184         }
4185 
4186         nslist[i++] = nsid;
4187         clear_bit(nsid, n->changed_nsids);
4188     }
4189 
4190      /*
4191       * Clear any remaining changed namespace ids if we bailed out above
4192       * because more than 1024 namespaces have changed.
4193       */
4194     if (nslist[0] == 0xffffffff) {
4195         bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4196     }
4197 
4198     if (!rae) {
4199         nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4200     }
4201 
4202     return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4203 }
4204 
4205 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4206                                  uint64_t off, NvmeRequest *req)
4207 {
4208     NvmeEffectsLog log = {};
4209     const uint32_t *src_iocs = NULL;
4210     uint32_t trans_len;
4211 
4212     if (off >= sizeof(log)) {
4213         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4214         return NVME_INVALID_FIELD | NVME_DNR;
4215     }
4216 
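    /*
     * Which I/O command set effects are reported depends on CC.CSS: Admin
     * Only exposes none, NVM exposes the NVM command set, and CSI-based
     * selection uses the CSI from the command to pick the NVM or Zoned table.
     */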
4217     switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4218     case NVME_CC_CSS_NVM:
4219         src_iocs = nvme_cse_iocs_nvm;
4220         /* fall through */
4221     case NVME_CC_CSS_ADMIN_ONLY:
4222         break;
4223     case NVME_CC_CSS_CSI:
4224         switch (csi) {
4225         case NVME_CSI_NVM:
4226             src_iocs = nvme_cse_iocs_nvm;
4227             break;
4228         case NVME_CSI_ZONED:
4229             src_iocs = nvme_cse_iocs_zoned;
4230             break;
4231         }
4232     }
4233 
4234     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4235 
4236     if (src_iocs) {
4237         memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4238     }
4239 
4240     trans_len = MIN(sizeof(log) - off, buf_len);
4241 
4242     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4243 }
4244 
4245 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4246 {
4247     NvmeCmd *cmd = &req->cmd;
4248 
4249     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4250     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4251     uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4252     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4253     uint8_t  lid = dw10 & 0xff;
4254     uint8_t  lsp = (dw10 >> 8) & 0xf;
4255     uint8_t  rae = (dw10 >> 15) & 0x1;
4256     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
4257     uint32_t numdl, numdu;
4258     uint64_t off, lpol, lpou;
4259     size_t   len;
4260     uint16_t status;
4261 
4262     numdl = (dw10 >> 16);
4263     numdu = (dw11 & 0xffff);
4264     lpol = dw12;
4265     lpou = dw13;
4266 
4267     len = (((numdu << 16) | numdl) + 1) << 2;
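    /*
     * NUMD is a 0's based dword count split across NUMDL (cdw10[31:16]) and
     * NUMDU (cdw11[15:0]); e.g. numdl = 3, numdu = 0 requests 16 bytes.  The
     * log page offset is the 64-bit concatenation of LPOU and LPOL.
     */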
4268     off = (lpou << 32ULL) | lpol;
4269 
4270     if (off & 0x3) {
4271         return NVME_INVALID_FIELD | NVME_DNR;
4272     }
4273 
4274     trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4275 
4276     status = nvme_check_mdts(n, len);
4277     if (status) {
4278         return status;
4279     }
4280 
4281     switch (lid) {
4282     case NVME_LOG_ERROR_INFO:
4283         return nvme_error_info(n, rae, len, off, req);
4284     case NVME_LOG_SMART_INFO:
4285         return nvme_smart_info(n, rae, len, off, req);
4286     case NVME_LOG_FW_SLOT_INFO:
4287         return nvme_fw_log_info(n, len, off, req);
4288     case NVME_LOG_CHANGED_NSLIST:
4289         return nvme_changed_nslist(n, rae, len, off, req);
4290     case NVME_LOG_CMD_EFFECTS:
4291         return nvme_cmd_effects(n, csi, len, off, req);
4292     default:
4293         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4294         return NVME_INVALID_FIELD | NVME_DNR;
4295     }
4296 }
4297 
4298 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4299 {
4300     n->cq[cq->cqid] = NULL;
4301     timer_free(cq->timer);
4302     if (msix_enabled(&n->parent_obj)) {
4303         msix_vector_unuse(&n->parent_obj, cq->vector);
4304     }
4305     if (cq->cqid) {
4306         g_free(cq);
4307     }
4308 }
4309 
4310 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4311 {
4312     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4313     NvmeCQueue *cq;
4314     uint16_t qid = le16_to_cpu(c->qid);
4315 
4316     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4317         trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4318         return NVME_INVALID_CQID | NVME_DNR;
4319     }
4320 
4321     cq = n->cq[qid];
4322     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4323         trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4324         return NVME_INVALID_QUEUE_DEL;
4325     }
4326 
4327     if (cq->irq_enabled && cq->tail != cq->head) {
4328         n->cq_pending--;
4329     }
4330 
4331     nvme_irq_deassert(n, cq);
4332     trace_pci_nvme_del_cq(qid);
4333     nvme_free_cq(cq, n);
4334     return NVME_SUCCESS;
4335 }
4336 
4337 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4338                          uint16_t cqid, uint16_t vector, uint16_t size,
4339                          uint16_t irq_enabled)
4340 {
4341     int ret;
4342 
4343     if (msix_enabled(&n->parent_obj)) {
4344         ret = msix_vector_use(&n->parent_obj, vector);
4345         assert(ret == 0);
4346     }
4347     cq->ctrl = n;
4348     cq->cqid = cqid;
4349     cq->size = size;
4350     cq->dma_addr = dma_addr;
4351     cq->phase = 1;
4352     cq->irq_enabled = irq_enabled;
4353     cq->vector = vector;
4354     cq->head = cq->tail = 0;
4355     QTAILQ_INIT(&cq->req_list);
4356     QTAILQ_INIT(&cq->sq_list);
4357     n->cq[cqid] = cq;
4358     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4359 }
4360 
4361 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4362 {
4363     NvmeCQueue *cq;
4364     NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4365     uint16_t cqid = le16_to_cpu(c->cqid);
4366     uint16_t vector = le16_to_cpu(c->irq_vector);
4367     uint16_t qsize = le16_to_cpu(c->qsize);
4368     uint16_t qflags = le16_to_cpu(c->cq_flags);
4369     uint64_t prp1 = le64_to_cpu(c->prp1);
4370 
4371     trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4372                              NVME_CQ_FLAGS_IEN(qflags) != 0);
4373 
4374     if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4375         n->cq[cqid] != NULL)) {
4376         trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4377         return NVME_INVALID_QID | NVME_DNR;
4378     }
4379     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4380         trace_pci_nvme_err_invalid_create_cq_size(qsize);
4381         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4382     }
4383     if (unlikely(prp1 & (n->page_size - 1))) {
4384         trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4385         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4386     }
4387     if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4388         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4389         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4390     }
4391     if (unlikely(vector >= n->params.msix_qsize)) {
4392         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4393         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4394     }
4395     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4396         trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4397         return NVME_INVALID_FIELD | NVME_DNR;
4398     }
4399 
4400     cq = g_malloc0(sizeof(*cq));
4401     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4402                  NVME_CQ_FLAGS_IEN(qflags));
4403 
4404     /*
4405      * It is only required to set qs_created when creating a completion queue;
4406      * creating a submission queue without a matching completion queue will
4407      * fail.
4408      */
4409     n->qs_created = true;
4410     return NVME_SUCCESS;
4411 }
4412 
4413 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4414 {
4415     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4416 
4417     return nvme_c2h(n, id, sizeof(id), req);
4418 }
4419 
4420 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4421 {
4422     trace_pci_nvme_identify_ctrl();
4423 
4424     return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4425 }
4426 
4427 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4428 {
4429     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4430     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4431     NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4432 
4433     trace_pci_nvme_identify_ctrl_csi(c->csi);
4434 
4435     switch (c->csi) {
4436     case NVME_CSI_NVM:
4437         id_nvm->vsl = n->params.vsl;
4438         id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4439         break;
4440 
4441     case NVME_CSI_ZONED:
4442         ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4443         break;
4444 
4445     default:
4446         return NVME_INVALID_FIELD | NVME_DNR;
4447     }
4448 
4449     return nvme_c2h(n, id, sizeof(id), req);
4450 }
4451 
4452 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4453 {
4454     NvmeNamespace *ns;
4455     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4456     uint32_t nsid = le32_to_cpu(c->nsid);
4457 
4458     trace_pci_nvme_identify_ns(nsid);
4459 
4460     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4461         return NVME_INVALID_NSID | NVME_DNR;
4462     }
4463 
4464     ns = nvme_ns(n, nsid);
4465     if (unlikely(!ns)) {
4466         if (!active) {
4467             ns = nvme_subsys_ns(n->subsys, nsid);
4468             if (!ns) {
4469                 return nvme_rpt_empty_id_struct(n, req);
4470             }
4471         } else {
4472             return nvme_rpt_empty_id_struct(n, req);
4473         }
4474     }
4475 
4476     if (active || ns->csi == NVME_CSI_NVM) {
4477         return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4478     }
4479 
4480     return NVME_INVALID_CMD_SET | NVME_DNR;
4481 }
4482 
4483 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4484                                         bool attached)
4485 {
4486     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4487     uint32_t nsid = le32_to_cpu(c->nsid);
4488     uint16_t min_id = le16_to_cpu(c->ctrlid);
4489     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4490     uint16_t *ids = &list[1];
4491     NvmeNamespace *ns;
4492     NvmeCtrl *ctrl;
4493     int cntlid, nr_ids = 0;
4494 
4495     trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4496 
4497     if (!n->subsys) {
4498         return NVME_INVALID_FIELD | NVME_DNR;
4499     }
4500 
4501     if (attached) {
4502         if (nsid == NVME_NSID_BROADCAST) {
4503             return NVME_INVALID_FIELD | NVME_DNR;
4504         }
4505 
4506         ns = nvme_subsys_ns(n->subsys, nsid);
4507         if (!ns) {
4508             return NVME_INVALID_FIELD | NVME_DNR;
4509         }
4510     }
4511 
4512     for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4513         ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4514         if (!ctrl) {
4515             continue;
4516         }
4517 
4518         if (attached && !nvme_ns(ctrl, nsid)) {
4519             continue;
4520         }
4521 
4522         ids[nr_ids++] = cntlid;
4523     }
4524 
4525     list[0] = nr_ids;
4526 
4527     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4528 }
4529 
4530 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4531                                      bool active)
4532 {
4533     NvmeNamespace *ns;
4534     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4535     uint32_t nsid = le32_to_cpu(c->nsid);
4536 
4537     trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4538 
4539     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4540         return NVME_INVALID_NSID | NVME_DNR;
4541     }
4542 
4543     ns = nvme_ns(n, nsid);
4544     if (unlikely(!ns)) {
4545         if (!active) {
4546             ns = nvme_subsys_ns(n->subsys, nsid);
4547             if (!ns) {
4548                 return nvme_rpt_empty_id_struct(n, req);
4549             }
4550         } else {
4551             return nvme_rpt_empty_id_struct(n, req);
4552         }
4553     }
4554 
4555     if (c->csi == NVME_CSI_NVM) {
4556         return nvme_rpt_empty_id_struct(n, req);
4557     } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4558         return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4559                         req);
4560     }
4561 
4562     return NVME_INVALID_FIELD | NVME_DNR;
4563 }
4564 
4565 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4566                                      bool active)
4567 {
4568     NvmeNamespace *ns;
4569     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4570     uint32_t min_nsid = le32_to_cpu(c->nsid);
4571     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4572     static const int data_len = sizeof(list);
4573     uint32_t *list_ptr = (uint32_t *)list;
4574     int i, j = 0;
4575 
4576     trace_pci_nvme_identify_nslist(min_nsid);
4577 
4578     /*
4579      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
4580      * since the Active Namespace ID List should return namespaces with ids
4581      * *higher* than the NSID specified in the command. This is also specified
4582      * in the spec (NVM Express v1.3d, Section 5.15.4).
4583      */
4584     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4585         return NVME_INVALID_NSID | NVME_DNR;
4586     }
4587 
4588     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4589         ns = nvme_ns(n, i);
4590         if (!ns) {
4591             if (!active) {
4592                 ns = nvme_subsys_ns(n->subsys, i);
4593                 if (!ns) {
4594                     continue;
4595                 }
4596             } else {
4597                 continue;
4598             }
4599         }
4600         if (ns->params.nsid <= min_nsid) {
4601             continue;
4602         }
4603         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4604         if (j == data_len / sizeof(uint32_t)) {
4605             break;
4606         }
4607     }
4608 
4609     return nvme_c2h(n, list, data_len, req);
4610 }
4611 
4612 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4613                                          bool active)
4614 {
4615     NvmeNamespace *ns;
4616     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4617     uint32_t min_nsid = le32_to_cpu(c->nsid);
4618     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4619     static const int data_len = sizeof(list);
4620     uint32_t *list_ptr = (uint32_t *)list;
4621     int i, j = 0;
4622 
4623     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4624 
4625     /*
4626      * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
4627      */
4628     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4629         return NVME_INVALID_NSID | NVME_DNR;
4630     }
4631 
4632     if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4633         return NVME_INVALID_FIELD | NVME_DNR;
4634     }
4635 
4636     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4637         ns = nvme_ns(n, i);
4638         if (!ns) {
4639             if (!active) {
4640                 ns = nvme_subsys_ns(n->subsys, i);
4641                 if (!ns) {
4642                     continue;
4643                 }
4644             } else {
4645                 continue;
4646             }
4647         }
4648         if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4649             continue;
4650         }
4651         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4652         if (j == data_len / sizeof(uint32_t)) {
4653             break;
4654         }
4655     }
4656 
4657     return nvme_c2h(n, list, data_len, req);
4658 }
4659 
4660 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4661 {
4662     NvmeNamespace *ns;
4663     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4664     uint32_t nsid = le32_to_cpu(c->nsid);
4665     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4666     uint8_t *pos = list;
4667     struct {
4668         NvmeIdNsDescr hdr;
4669         uint8_t v[NVME_NIDL_UUID];
4670     } QEMU_PACKED uuid = {};
4671     struct {
4672         NvmeIdNsDescr hdr;
4673         uint64_t v;
4674     } QEMU_PACKED eui64 = {};
4675     struct {
4676         NvmeIdNsDescr hdr;
4677         uint8_t v;
4678     } QEMU_PACKED csi = {};
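    /*
     * Each descriptor below is a (type, length) header followed by the
     * identifier value; the zero-initialized remainder of the buffer
     * terminates the list.
     */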
4679 
4680     trace_pci_nvme_identify_ns_descr_list(nsid);
4681 
4682     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4683         return NVME_INVALID_NSID | NVME_DNR;
4684     }
4685 
4686     ns = nvme_ns(n, nsid);
4687     if (unlikely(!ns)) {
4688         return NVME_INVALID_FIELD | NVME_DNR;
4689     }
4690 
4691     /*
4692      * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4693      * provide a valid Namespace UUID in the Namespace Identification Descriptor
4694      * data structure. QEMU does not yet support setting NGUID.
4695      */
4696     uuid.hdr.nidt = NVME_NIDT_UUID;
4697     uuid.hdr.nidl = NVME_NIDL_UUID;
4698     memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4699     memcpy(pos, &uuid, sizeof(uuid));
4700     pos += sizeof(uuid);
4701 
4702     if (ns->params.eui64) {
4703         eui64.hdr.nidt = NVME_NIDT_EUI64;
4704         eui64.hdr.nidl = NVME_NIDL_EUI64;
4705         eui64.v = cpu_to_be64(ns->params.eui64);
4706         memcpy(pos, &eui64, sizeof(eui64));
4707         pos += sizeof(eui64);
4708     }
4709 
4710     csi.hdr.nidt = NVME_NIDT_CSI;
4711     csi.hdr.nidl = NVME_NIDL_CSI;
4712     csi.v = ns->csi;
4713     memcpy(pos, &csi, sizeof(csi));
4714     pos += sizeof(csi);
4715 
4716     return nvme_c2h(n, list, sizeof(list), req);
4717 }
4718 
4719 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4720 {
4721     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4722     static const int data_len = sizeof(list);
4723 
4724     trace_pci_nvme_identify_cmd_set();
4725 
4726     NVME_SET_CSI(*list, NVME_CSI_NVM);
4727     NVME_SET_CSI(*list, NVME_CSI_ZONED);
4728 
4729     return nvme_c2h(n, list, data_len, req);
4730 }
4731 
4732 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4733 {
4734     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4735 
4736     trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4737                             c->csi);
4738 
4739     switch (c->cns) {
4740     case NVME_ID_CNS_NS:
4741         return nvme_identify_ns(n, req, true);
4742     case NVME_ID_CNS_NS_PRESENT:
4743         return nvme_identify_ns(n, req, false);
4744     case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4745         return nvme_identify_ctrl_list(n, req, true);
4746     case NVME_ID_CNS_CTRL_LIST:
4747         return nvme_identify_ctrl_list(n, req, false);
4748     case NVME_ID_CNS_CS_NS:
4749         return nvme_identify_ns_csi(n, req, true);
4750     case NVME_ID_CNS_CS_NS_PRESENT:
4751         return nvme_identify_ns_csi(n, req, false);
4752     case NVME_ID_CNS_CTRL:
4753         return nvme_identify_ctrl(n, req);
4754     case NVME_ID_CNS_CS_CTRL:
4755         return nvme_identify_ctrl_csi(n, req);
4756     case NVME_ID_CNS_NS_ACTIVE_LIST:
4757         return nvme_identify_nslist(n, req, true);
4758     case NVME_ID_CNS_NS_PRESENT_LIST:
4759         return nvme_identify_nslist(n, req, false);
4760     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4761         return nvme_identify_nslist_csi(n, req, true);
4762     case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4763         return nvme_identify_nslist_csi(n, req, false);
4764     case NVME_ID_CNS_NS_DESCR_LIST:
4765         return nvme_identify_ns_descr_list(n, req);
4766     case NVME_ID_CNS_IO_COMMAND_SET:
4767         return nvme_identify_cmd_set(n, req);
4768     default:
4769         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4770         return NVME_INVALID_FIELD | NVME_DNR;
4771     }
4772 }
4773 
4774 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4775 {
4776     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4777 
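    /*
     * Bit 0 of dword 0 set to 1 indicates that the command was not aborted;
     * this controller does not actually abort any commands.
     */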
4778     req->cqe.result = 1;
4779     if (nvme_check_sqid(n, sqid)) {
4780         return NVME_INVALID_FIELD | NVME_DNR;
4781     }
4782 
4783     return NVME_SUCCESS;
4784 }
4785 
4786 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4787 {
4788     trace_pci_nvme_setfeat_timestamp(ts);
4789 
4790     n->host_timestamp = le64_to_cpu(ts);
4791     n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4792 }
4793 
4794 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4795 {
4796     uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4797     uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4798 
4799     union nvme_timestamp {
4800         struct {
4801             uint64_t timestamp:48;
4802             uint64_t sync:1;
4803             uint64_t origin:3;
4804             uint64_t rsvd1:12;
4805         };
4806         uint64_t all;
4807     };
4808 
4809     union nvme_timestamp ts;
4810     ts.all = 0;
4811     ts.timestamp = n->host_timestamp + elapsed_time;
4812 
4813     /* If the host timestamp is non-zero, set the timestamp origin */
4814     ts.origin = n->host_timestamp ? 0x01 : 0x00;
4815 
4816     trace_pci_nvme_getfeat_timestamp(ts.all);
4817 
4818     return cpu_to_le64(ts.all);
4819 }
4820 
4821 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4822 {
4823     uint64_t timestamp = nvme_get_timestamp(n);
4824 
4825     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4826 }
4827 
4828 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4829 {
4830     NvmeCmd *cmd = &req->cmd;
4831     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4832     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4833     uint32_t nsid = le32_to_cpu(cmd->nsid);
4834     uint32_t result;
4835     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4836     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4837     uint16_t iv;
4838     NvmeNamespace *ns;
4839     int i;
4840 
4841     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4842         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4843     };
4844 
4845     trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4846 
4847     if (!nvme_feature_support[fid]) {
4848         return NVME_INVALID_FIELD | NVME_DNR;
4849     }
4850 
4851     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4852         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4853             /*
4854              * The Reservation Notification Mask and Reservation Persistence
4855              * features require a status code of Invalid Field in Command when
4856              * NSID is FFFFFFFFh. Since the device does not support those
4857              * features we can always return Invalid Namespace or Format as we
4858              * should do for all other features.
4859              */
4860             return NVME_INVALID_NSID | NVME_DNR;
4861         }
4862 
4863         if (!nvme_ns(n, nsid)) {
4864             return NVME_INVALID_FIELD | NVME_DNR;
4865         }
4866     }
4867 
4868     switch (sel) {
4869     case NVME_GETFEAT_SELECT_CURRENT:
4870         break;
4871     case NVME_GETFEAT_SELECT_SAVED:
4872         /* no features are saveable by the controller; fallthrough */
4873     case NVME_GETFEAT_SELECT_DEFAULT:
4874         goto defaults;
4875     case NVME_GETFEAT_SELECT_CAP:
4876         result = nvme_feature_cap[fid];
4877         goto out;
4878     }
4879 
4880     switch (fid) {
4881     case NVME_TEMPERATURE_THRESHOLD:
4882         result = 0;
4883 
4884         /*
4885          * The controller only implements the Composite Temperature sensor, so
4886          * return 0 for all other sensors.
4887          */
4888         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4889             goto out;
4890         }
4891 
4892         switch (NVME_TEMP_THSEL(dw11)) {
4893         case NVME_TEMP_THSEL_OVER:
4894             result = n->features.temp_thresh_hi;
4895             goto out;
4896         case NVME_TEMP_THSEL_UNDER:
4897             result = n->features.temp_thresh_low;
4898             goto out;
4899         }
4900 
4901         return NVME_INVALID_FIELD | NVME_DNR;
4902     case NVME_ERROR_RECOVERY:
4903         if (!nvme_nsid_valid(n, nsid)) {
4904             return NVME_INVALID_NSID | NVME_DNR;
4905         }
4906 
4907         ns = nvme_ns(n, nsid);
4908         if (unlikely(!ns)) {
4909             return NVME_INVALID_FIELD | NVME_DNR;
4910         }
4911 
4912         result = ns->features.err_rec;
4913         goto out;
4914     case NVME_VOLATILE_WRITE_CACHE:
4915         result = 0;
4916         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4917             ns = nvme_ns(n, i);
4918             if (!ns) {
4919                 continue;
4920             }
4921 
4922             result = blk_enable_write_cache(ns->blkconf.blk);
4923             if (result) {
4924                 break;
4925             }
4926         }
4927         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4928         goto out;
4929     case NVME_ASYNCHRONOUS_EVENT_CONF:
4930         result = n->features.async_config;
4931         goto out;
4932     case NVME_TIMESTAMP:
4933         return nvme_get_feature_timestamp(n, req);
4934     default:
4935         break;
4936     }
4937 
4938 defaults:
4939     switch (fid) {
4940     case NVME_TEMPERATURE_THRESHOLD:
4941         result = 0;
4942 
4943         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4944             break;
4945         }
4946 
4947         if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4948             result = NVME_TEMPERATURE_WARNING;
4949         }
4950 
4951         break;
4952     case NVME_NUMBER_OF_QUEUES:
4953         result = (n->params.max_ioqpairs - 1) |
4954             ((n->params.max_ioqpairs - 1) << 16);
4955         trace_pci_nvme_getfeat_numq(result);
4956         break;
4957     case NVME_INTERRUPT_VECTOR_CONF:
4958         iv = dw11 & 0xffff;
4959         if (iv >= n->params.max_ioqpairs + 1) {
4960             return NVME_INVALID_FIELD | NVME_DNR;
4961         }
4962 
4963         result = iv;
4964         if (iv == n->admin_cq.vector) {
4965             result |= NVME_INTVC_NOCOALESCING;
4966         }
4967         break;
4968     default:
4969         result = nvme_feature_default[fid];
4970         break;
4971     }
4972 
4973 out:
4974     req->cqe.result = cpu_to_le32(result);
4975     return NVME_SUCCESS;
4976 }
4977 
4978 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4979 {
4980     uint16_t ret;
4981     uint64_t timestamp;
4982 
4983     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4984     if (ret) {
4985         return ret;
4986     }
4987 
4988     nvme_set_timestamp(n, timestamp);
4989 
4990     return NVME_SUCCESS;
4991 }
4992 
4993 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4994 {
4995     NvmeNamespace *ns = NULL;
4996 
4997     NvmeCmd *cmd = &req->cmd;
4998     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4999     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5000     uint32_t nsid = le32_to_cpu(cmd->nsid);
5001     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5002     uint8_t save = NVME_SETFEAT_SAVE(dw10);
5003     int i;
5004 
5005     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5006 
5007     if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5008         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5009     }
5010 
5011     if (!nvme_feature_support[fid]) {
5012         return NVME_INVALID_FIELD | NVME_DNR;
5013     }
5014 
5015     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5016         if (nsid != NVME_NSID_BROADCAST) {
5017             if (!nvme_nsid_valid(n, nsid)) {
5018                 return NVME_INVALID_NSID | NVME_DNR;
5019             }
5020 
5021             ns = nvme_ns(n, nsid);
5022             if (unlikely(!ns)) {
5023                 return NVME_INVALID_FIELD | NVME_DNR;
5024             }
5025         }
5026     } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5027         if (!nvme_nsid_valid(n, nsid)) {
5028             return NVME_INVALID_NSID | NVME_DNR;
5029         }
5030 
5031         return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5032     }
5033 
5034     if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5035         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5036     }
5037 
5038     switch (fid) {
5039     case NVME_TEMPERATURE_THRESHOLD:
5040         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5041             break;
5042         }
5043 
5044         switch (NVME_TEMP_THSEL(dw11)) {
5045         case NVME_TEMP_THSEL_OVER:
5046             n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5047             break;
5048         case NVME_TEMP_THSEL_UNDER:
5049             n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5050             break;
5051         default:
5052             return NVME_INVALID_FIELD | NVME_DNR;
5053         }
5054 
5055         if ((n->temperature >= n->features.temp_thresh_hi) ||
5056             (n->temperature <= n->features.temp_thresh_low)) {
5057             nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5058         }
5059 
5060         break;
5061     case NVME_ERROR_RECOVERY:
5062         if (nsid == NVME_NSID_BROADCAST) {
5063             for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5064                 ns = nvme_ns(n, i);
5065 
5066                 if (!ns) {
5067                     continue;
5068                 }
5069 
5070                 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5071                     ns->features.err_rec = dw11;
5072                 }
5073             }
5074 
5075             break;
5076         }
5077 
5078         assert(ns);
5079         if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
5080             ns->features.err_rec = dw11;
5081         }
5082         break;
5083     case NVME_VOLATILE_WRITE_CACHE:
5084         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5085             ns = nvme_ns(n, i);
5086             if (!ns) {
5087                 continue;
5088             }
5089 
5090             if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5091                 blk_flush(ns->blkconf.blk);
5092             }
5093 
5094             blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5095         }
5096 
5097         break;
5098 
5099     case NVME_NUMBER_OF_QUEUES:
5100         if (n->qs_created) {
5101             return NVME_CMD_SEQ_ERROR | NVME_DNR;
5102         }
5103 
5104         /*
5105          * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
5106          * and NSQR.
5107          */
5108         if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5109             return NVME_INVALID_FIELD | NVME_DNR;
5110         }
5111 
5112         trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5113                                     ((dw11 >> 16) & 0xffff) + 1,
5114                                     n->params.max_ioqpairs,
5115                                     n->params.max_ioqpairs);
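        /*
         * Return the 0's based number of I/O submission queues allocated in
         * bits 15:0 and completion queues in bits 31:16.
         */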
5116         req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5117                                       ((n->params.max_ioqpairs - 1) << 16));
5118         break;
5119     case NVME_ASYNCHRONOUS_EVENT_CONF:
5120         n->features.async_config = dw11;
5121         break;
5122     case NVME_TIMESTAMP:
5123         return nvme_set_feature_timestamp(n, req);
5124     case NVME_COMMAND_SET_PROFILE:
5125         if (dw11 & 0x1ff) {
5126             trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5127             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5128         }
5129         break;
5130     default:
5131         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5132     }
5133     return NVME_SUCCESS;
5134 }
5135 
5136 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5137 {
5138     trace_pci_nvme_aer(nvme_cid(req));
5139 
5140     if (n->outstanding_aers > n->params.aerl) {
5141         trace_pci_nvme_aer_aerl_exceeded();
5142         return NVME_AER_LIMIT_EXCEEDED;
5143     }
5144 
5145     n->aer_reqs[n->outstanding_aers] = req;
5146     n->outstanding_aers++;
5147 
5148     if (!QTAILQ_EMPTY(&n->aer_queue)) {
5149         nvme_process_aers(n);
5150     }
5151 
5152     return NVME_NO_COMPLETE;
5153 }
5154 
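/*
 * Tighten DMRSL (Dataset Management Range Size Limit, in logical blocks) so
 * that a single range never exceeds the maximum block-layer request size
 * for any attached namespace.
 */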
5155 static void nvme_update_dmrsl(NvmeCtrl *n)
5156 {
5157     int nsid;
5158 
5159     for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5160         NvmeNamespace *ns = nvme_ns(n, nsid);
5161         if (!ns) {
5162             continue;
5163         }
5164 
5165         n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5166                                 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5167     }
5168 }
5169 
5170 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5171 {
5172     uint32_t cc = ldl_le_p(&n->bar.cc);
5173 
5174     ns->iocs = nvme_cse_iocs_none;
5175     switch (ns->csi) {
5176     case NVME_CSI_NVM:
5177         if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5178             ns->iocs = nvme_cse_iocs_nvm;
5179         }
5180         break;
5181     case NVME_CSI_ZONED:
5182         if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5183             ns->iocs = nvme_cse_iocs_zoned;
5184         } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5185             ns->iocs = nvme_cse_iocs_nvm;
5186         }
5187         break;
5188     }
5189 }
5190 
5191 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5192 {
5193     NvmeNamespace *ns;
5194     NvmeCtrl *ctrl;
5195     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5196     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5197     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5198     uint8_t sel = dw10 & 0xf;
5199     uint16_t *nr_ids = &list[0];
5200     uint16_t *ids = &list[1];
5201     uint16_t ret;
5202     int i;
5203 
5204     trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5205 
5206     if (!nvme_nsid_valid(n, nsid)) {
5207         return NVME_INVALID_NSID | NVME_DNR;
5208     }
5209 
5210     ns = nvme_subsys_ns(n->subsys, nsid);
5211     if (!ns) {
5212         return NVME_INVALID_FIELD | NVME_DNR;
5213     }
5214 
5215     ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5216     if (ret) {
5217         return ret;
5218     }
5219 
5220     if (!*nr_ids) {
5221         return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5222     }
5223 
5224     *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5225     for (i = 0; i < *nr_ids; i++) {
5226         ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5227         if (!ctrl) {
5228             return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5229         }
5230 
5231         switch (sel) {
5232         case NVME_NS_ATTACHMENT_ATTACH:
5233             if (nvme_ns(ctrl, nsid)) {
5234                 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5235             }
5236 
5237             if (ns->attached && !ns->params.shared) {
5238                 return NVME_NS_PRIVATE | NVME_DNR;
5239             }
5240 
5241             nvme_attach_ns(ctrl, ns);
5242             nvme_select_iocs_ns(ctrl, ns);
5243 
5244             break;
5245 
5246         case NVME_NS_ATTACHMENT_DETACH:
5247             if (!nvme_ns(ctrl, nsid)) {
5248                 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5249             }
5250 
5251             ctrl->namespaces[nsid] = NULL;
5252             ns->attached--;
5253 
5254             nvme_update_dmrsl(ctrl);
5255 
5256             break;
5257 
5258         default:
5259             return NVME_INVALID_FIELD | NVME_DNR;
5260         }
5261 
5262         /*
5263          * Add namespace id to the changed namespace id list for event clearing
5264          * via Get Log Page command.
5265          */
5266         if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5267             nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5268                                NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5269                                NVME_LOG_CHANGED_NSLIST);
5270         }
5271     }
5272 
5273     return NVME_SUCCESS;
5274 }
5275 
5276 typedef struct NvmeFormatAIOCB {
5277     BlockAIOCB common;
5278     BlockAIOCB *aiocb;
5279     QEMUBH *bh;
5280     NvmeRequest *req;
5281     int ret;
5282 
5283     NvmeNamespace *ns;
5284     uint32_t nsid;
5285     bool broadcast;
5286     int64_t offset;
5287 } NvmeFormatAIOCB;
5288 
5289 static void nvme_format_bh(void *opaque);
5290 
5291 static void nvme_format_cancel(BlockAIOCB *aiocb)
5292 {
5293     NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5294 
5295     if (iocb->aiocb) {
5296         blk_aio_cancel_async(iocb->aiocb);
5297     }
5298 }
5299 
5300 static const AIOCBInfo nvme_format_aiocb_info = {
5301     .aiocb_size = sizeof(NvmeFormatAIOCB),
5302     .cancel_async = nvme_format_cancel,
5303     .get_aio_context = nvme_get_aio_context,
5304 };
5305 
5306 static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5307 {
5308     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5309     uint8_t lbaf = dw10 & 0xf;
5310     uint8_t pi = (dw10 >> 5) & 0x7;
5311     uint8_t mset = (dw10 >> 4) & 0x1;
5312     uint8_t pil = (dw10 >> 8) & 0x1;
5313 
5314     trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5315 
5316     ns->id_ns.dps = (pil << 3) | pi;
5317     ns->id_ns.flbas = lbaf | (mset << 4);
5318 
5319     nvme_ns_init_format(ns);
5320 }
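/*
 * Illustrative decode of the Format NVM CDW10 fields handled above (a sketch,
 * not quoted from the spec text): for dw10 = 0x31,
 *
 *     lbaf = 0x31 & 0xf         = 1    (LBA Format 1)
 *     mset = (0x31 >> 4) & 0x1  = 1    (extended metadata)
 *     pi   = (0x31 >> 5) & 0x7  = 1    (Protection Information Type 1)
 *     pil  = (0x31 >> 8) & 0x1  = 0    (PI in the last bytes of metadata)
 *
 * which yields id_ns.dps = 0x01 and id_ns.flbas = 0x11.
 */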
5321 
5322 static void nvme_format_ns_cb(void *opaque, int ret)
5323 {
5324     NvmeFormatAIOCB *iocb = opaque;
5325     NvmeRequest *req = iocb->req;
5326     NvmeNamespace *ns = iocb->ns;
5327     int bytes;
5328 
5329     if (ret < 0) {
5330         iocb->ret = ret;
5331         goto done;
5332     }
5333 
5334     assert(ns);
5335 
5336     if (iocb->offset < ns->size) {
5337         bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5338 
5339         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5340                                             bytes, BDRV_REQ_MAY_UNMAP,
5341                                             nvme_format_ns_cb, iocb);
5342 
5343         iocb->offset += bytes;
5344         return;
5345     }
5346 
5347     nvme_format_set(ns, &req->cmd);
5348     ns->status = 0x0;
5349     iocb->ns = NULL;
5350     iocb->offset = 0;
5351 
5352 done:
5353     iocb->aiocb = NULL;
5354     qemu_bh_schedule(iocb->bh);
5355 }
5356 
5357 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5358 {
5359     if (ns->params.zoned) {
5360         return NVME_INVALID_FORMAT | NVME_DNR;
5361     }
5362 
5363     if (lbaf > ns->id_ns.nlbaf) {
5364         return NVME_INVALID_FORMAT | NVME_DNR;
5365     }
5366 
5367     if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5368         return NVME_INVALID_FORMAT | NVME_DNR;
5369     }
5370 
5371     if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5372         return NVME_INVALID_FIELD | NVME_DNR;
5373     }
5374 
5375     return NVME_SUCCESS;
5376 }
5377 
5378 static void nvme_format_bh(void *opaque)
5379 {
5380     NvmeFormatAIOCB *iocb = opaque;
5381     NvmeRequest *req = iocb->req;
5382     NvmeCtrl *n = nvme_ctrl(req);
5383     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5384     uint8_t lbaf = dw10 & 0xf;
5385     uint8_t pi = (dw10 >> 5) & 0x7;
5386     uint16_t status;
5387     int i;
5388 
5389     if (iocb->ret < 0) {
5390         goto done;
5391     }
5392 
5393     if (iocb->broadcast) {
5394         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5395             iocb->ns = nvme_ns(n, i);
5396             if (iocb->ns) {
5397                 iocb->nsid = i;
5398                 break;
5399             }
5400         }
5401     }
5402 
5403     if (!iocb->ns) {
5404         goto done;
5405     }
5406 
5407     status = nvme_format_check(iocb->ns, lbaf, pi);
5408     if (status) {
5409         req->status = status;
5410         goto done;
5411     }
5412 
5413     iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5414     nvme_format_ns_cb(iocb, 0);
5415     return;
5416 
5417 done:
5418     qemu_bh_delete(iocb->bh);
5419     iocb->bh = NULL;
5420 
5421     iocb->common.cb(iocb->common.opaque, iocb->ret);
5422 
5423     qemu_aio_unref(iocb);
5424 }
5425 
5426 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5427 {
5428     NvmeFormatAIOCB *iocb;
5429     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5430     uint16_t status;
5431 
5432     iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5433 
5434     iocb->req = req;
5435     iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5436     iocb->ret = 0;
5437     iocb->ns = NULL;
5438     iocb->nsid = 0;
5439     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5440     iocb->offset = 0;
5441 
5442     if (!iocb->broadcast) {
5443         if (!nvme_nsid_valid(n, nsid)) {
5444             status = NVME_INVALID_NSID | NVME_DNR;
5445             goto out;
5446         }
5447 
5448         iocb->ns = nvme_ns(n, nsid);
5449         if (!iocb->ns) {
5450             status = NVME_INVALID_FIELD | NVME_DNR;
5451             goto out;
5452         }
5453     }
5454 
5455     req->aiocb = &iocb->common;
5456     qemu_bh_schedule(iocb->bh);
5457 
5458     return NVME_NO_COMPLETE;
5459 
5460 out:
5461     qemu_bh_delete(iocb->bh);
5462     iocb->bh = NULL;
5463     qemu_aio_unref(iocb);
5464     return status;
5465 }
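/*
 * For reference, a guest could exercise this path with nvme-cli, e.g. (flags
 * quoted from memory, so treat this as a sketch and check `nvme format --help`):
 *
 *     nvme format /dev/nvme0n1 --lbaf=1
 *
 * or target every attached namespace at once by issuing the command against
 * the controller with the broadcast NSID (FFFFFFFFh).
 */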
5466 
5467 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5468 {
5469     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5470                              nvme_adm_opc_str(req->cmd.opcode));
5471 
5472     if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5473         trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5474         return NVME_INVALID_OPCODE | NVME_DNR;
5475     }
5476 
5477     /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5478     if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5479         return NVME_INVALID_FIELD | NVME_DNR;
5480     }
5481 
5482     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
5483         return NVME_INVALID_FIELD;
5484     }
5485 
5486     switch (req->cmd.opcode) {
5487     case NVME_ADM_CMD_DELETE_SQ:
5488         return nvme_del_sq(n, req);
5489     case NVME_ADM_CMD_CREATE_SQ:
5490         return nvme_create_sq(n, req);
5491     case NVME_ADM_CMD_GET_LOG_PAGE:
5492         return nvme_get_log(n, req);
5493     case NVME_ADM_CMD_DELETE_CQ:
5494         return nvme_del_cq(n, req);
5495     case NVME_ADM_CMD_CREATE_CQ:
5496         return nvme_create_cq(n, req);
5497     case NVME_ADM_CMD_IDENTIFY:
5498         return nvme_identify(n, req);
5499     case NVME_ADM_CMD_ABORT:
5500         return nvme_abort(n, req);
5501     case NVME_ADM_CMD_SET_FEATURES:
5502         return nvme_set_feature(n, req);
5503     case NVME_ADM_CMD_GET_FEATURES:
5504         return nvme_get_feature(n, req);
5505     case NVME_ADM_CMD_ASYNC_EV_REQ:
5506         return nvme_aer(n, req);
5507     case NVME_ADM_CMD_NS_ATTACHMENT:
5508         return nvme_ns_attachment(n, req);
5509     case NVME_ADM_CMD_FORMAT_NVM:
5510         return nvme_format(n, req);
5511     default:
5512         assert(false);
5513     }
5514 
5515     return NVME_INVALID_OPCODE | NVME_DNR;
5516 }
5517 
5518 static void nvme_process_sq(void *opaque)
5519 {
5520     NvmeSQueue *sq = opaque;
5521     NvmeCtrl *n = sq->ctrl;
5522     NvmeCQueue *cq = n->cq[sq->cqid];
5523 
5524     uint16_t status;
5525     hwaddr addr;
5526     NvmeCmd cmd;
5527     NvmeRequest *req;
5528 
5529     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5530         addr = sq->dma_addr + sq->head * n->sqe_size;
5531         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5532             trace_pci_nvme_err_addr_read(addr);
5533             trace_pci_nvme_err_cfs();
5534             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5535             break;
5536         }
5537         nvme_inc_sq_head(sq);
5538 
5539         req = QTAILQ_FIRST(&sq->req_list);
5540         QTAILQ_REMOVE(&sq->req_list, req, entry);
5541         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5542         nvme_req_clear(req);
5543         req->cqe.cid = cmd.cid;
5544         memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5545 
5546         status = sq->sqid ? nvme_io_cmd(n, req) :
5547             nvme_admin_cmd(n, req);
5548         if (status != NVME_NO_COMPLETE) {
5549             req->status = status;
5550             nvme_enqueue_req_completion(cq, req);
5551         }
5552     }
5553 }
5554 
5555 static void nvme_ctrl_reset(NvmeCtrl *n)
5556 {
5557     NvmeNamespace *ns;
5558     int i;
5559 
5560     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5561         ns = nvme_ns(n, i);
5562         if (!ns) {
5563             continue;
5564         }
5565 
5566         nvme_ns_drain(ns);
5567     }
5568 
5569     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5570         if (n->sq[i] != NULL) {
5571             nvme_free_sq(n->sq[i], n);
5572         }
5573     }
5574     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5575         if (n->cq[i] != NULL) {
5576             nvme_free_cq(n->cq[i], n);
5577         }
5578     }
5579 
5580     while (!QTAILQ_EMPTY(&n->aer_queue)) {
5581         NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5582         QTAILQ_REMOVE(&n->aer_queue, event, entry);
5583         g_free(event);
5584     }
5585 
5586     n->aer_queued = 0;
5587     n->outstanding_aers = 0;
5588     n->qs_created = false;
5589 }
5590 
5591 static void nvme_ctrl_shutdown(NvmeCtrl *n)
5592 {
5593     NvmeNamespace *ns;
5594     int i;
5595 
5596     if (n->pmr.dev) {
5597         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5598     }
5599 
5600     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5601         ns = nvme_ns(n, i);
5602         if (!ns) {
5603             continue;
5604         }
5605 
5606         nvme_ns_shutdown(ns);
5607     }
5608 }
5609 
5610 static void nvme_select_iocs(NvmeCtrl *n)
5611 {
5612     NvmeNamespace *ns;
5613     int i;
5614 
5615     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5616         ns = nvme_ns(n, i);
5617         if (!ns) {
5618             continue;
5619         }
5620 
5621         nvme_select_iocs_ns(n, ns);
5622     }
5623 }
5624 
5625 static int nvme_start_ctrl(NvmeCtrl *n)
5626 {
5627     uint64_t cap = ldq_le_p(&n->bar.cap);
5628     uint32_t cc = ldl_le_p(&n->bar.cc);
5629     uint32_t aqa = ldl_le_p(&n->bar.aqa);
5630     uint64_t asq = ldq_le_p(&n->bar.asq);
5631     uint64_t acq = ldq_le_p(&n->bar.acq);
5632     uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5633     uint32_t page_size = 1 << page_bits;
5634 
5635     if (unlikely(n->cq[0])) {
5636         trace_pci_nvme_err_startfail_cq();
5637         return -1;
5638     }
5639     if (unlikely(n->sq[0])) {
5640         trace_pci_nvme_err_startfail_sq();
5641         return -1;
5642     }
5643     if (unlikely(asq & (page_size - 1))) {
5644         trace_pci_nvme_err_startfail_asq_misaligned(asq);
5645         return -1;
5646     }
5647     if (unlikely(acq & (page_size - 1))) {
5648         trace_pci_nvme_err_startfail_acq_misaligned(acq);
5649         return -1;
5650     }
5651     if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5652         trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5653         return -1;
5654     }
5655     if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5656         trace_pci_nvme_err_startfail_page_too_small(
5657                     NVME_CC_MPS(cc),
5658                     NVME_CAP_MPSMIN(cap));
5659         return -1;
5660     }
5661     if (unlikely(NVME_CC_MPS(cc) >
5662                  NVME_CAP_MPSMAX(cap))) {
5663         trace_pci_nvme_err_startfail_page_too_large(
5664                     NVME_CC_MPS(cc),
5665                     NVME_CAP_MPSMAX(cap));
5666         return -1;
5667     }
5668     if (unlikely(NVME_CC_IOCQES(cc) <
5669                  NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5670         trace_pci_nvme_err_startfail_cqent_too_small(
5671                     NVME_CC_IOCQES(cc),
5672                     NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5673         return -1;
5674     }
5675     if (unlikely(NVME_CC_IOCQES(cc) >
5676                  NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5677         trace_pci_nvme_err_startfail_cqent_too_large(
5678                     NVME_CC_IOCQES(cc),
5679                     NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5680         return -1;
5681     }
5682     if (unlikely(NVME_CC_IOSQES(cc) <
5683                  NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5684         trace_pci_nvme_err_startfail_sqent_too_small(
5685                     NVME_CC_IOSQES(cc),
5686                     NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5687         return -1;
5688     }
5689     if (unlikely(NVME_CC_IOSQES(cc) >
5690                  NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5691         trace_pci_nvme_err_startfail_sqent_too_large(
5692                     NVME_CC_IOSQES(cc),
5693                     NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5694         return -1;
5695     }
5696     if (unlikely(!NVME_AQA_ASQS(aqa))) {
5697         trace_pci_nvme_err_startfail_asqent_sz_zero();
5698         return -1;
5699     }
5700     if (unlikely(!NVME_AQA_ACQS(aqa))) {
5701         trace_pci_nvme_err_startfail_acqent_sz_zero();
5702         return -1;
5703     }
5704 
5705     n->page_bits = page_bits;
5706     n->page_size = page_size;
5707     n->max_prp_ents = n->page_size / sizeof(uint64_t);
5708     n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5709     n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5710     nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5711     nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5712 
5713     nvme_set_timestamp(n, 0ULL);
5714 
5715     QTAILQ_INIT(&n->aer_queue);
5716 
5717     nvme_select_iocs(n);
5718 
5719     return 0;
5720 }
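/*
 * Sketch of the register programming a host performs before the checks above
 * can pass (values chosen to match the CAP/CQES/SQES this controller reports;
 * field layout per the CC register definition):
 *
 *     AQA = (acqs - 1) << 16 | (asqs - 1);   // 0's based admin queue sizes
 *     ASQ = admin_sq_base;                   // page aligned
 *     ACQ = admin_cq_base;                   // page aligned
 *     CC  = IOCQES=4 (16 byte CQE) | IOSQES=6 (64 byte SQE) |
 *           MPS=0 (4 KiB pages) | CSS=0 (NVM command set) | EN=1;
 */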
5721 
5722 static void nvme_cmb_enable_regs(NvmeCtrl *n)
5723 {
5724     uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5725     uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5726 
5727     NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5728     NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5729     NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5730     stl_le_p(&n->bar.cmbloc, cmbloc);
5731 
5732     NVME_CMBSZ_SET_SQS(cmbsz, 1);
5733     NVME_CMBSZ_SET_CQS(cmbsz, 0);
5734     NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5735     NVME_CMBSZ_SET_RDS(cmbsz, 1);
5736     NVME_CMBSZ_SET_WDS(cmbsz, 1);
5737     NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
5738     NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5739     stl_le_p(&n->bar.cmbsz, cmbsz);
5740 }
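/*
 * With SZU=2 the CMBSZ.SZ unit is 1 MiB (4 KiB * 16^2), so the advertised CMB
 * size comes out to exactly cmb_size_mb mebibytes. Worked example (assuming a
 * device created with cmb_size_mb=64): SZ=64, total CMB = 64 MiB at offset 0
 * of the BAR selected by CMBLOC.BIR.
 */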
5741 
5742 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5743                            unsigned size)
5744 {
5745     uint64_t cap = ldq_le_p(&n->bar.cap);
5746     uint32_t cc = ldl_le_p(&n->bar.cc);
5747     uint32_t intms = ldl_le_p(&n->bar.intms);
5748     uint32_t csts = ldl_le_p(&n->bar.csts);
5749     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5750 
5751     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5752         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5753                        "MMIO write not 32-bit aligned,"
5754                        " offset=0x%"PRIx64"", offset);
5755         /* should be ignored, fall through for now */
5756     }
5757 
5758     if (unlikely(size < sizeof(uint32_t))) {
5759         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5760                        "MMIO write smaller than 32-bits,"
5761                        " offset=0x%"PRIx64", size=%u",
5762                        offset, size);
5763         /* should be ignored, fall through for now */
5764     }
5765 
5766     switch (offset) {
5767     case NVME_REG_INTMS:
5768         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5769             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5770                            "undefined access to interrupt mask set"
5771                            " when MSI-X is enabled");
5772             /* should be ignored, fall through for now */
5773         }
5774         intms |= data;
5775         stl_le_p(&n->bar.intms, intms);
5776         n->bar.intmc = n->bar.intms;
5777         trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5778         nvme_irq_check(n);
5779         break;
5780     case NVME_REG_INTMC:
5781         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5782             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5783                            "undefined access to interrupt mask clr"
5784                            " when MSI-X is enabled");
5785             /* should be ignored, fall through for now */
5786         }
5787         intms &= ~data;
5788         stl_le_p(&n->bar.intms, intms);
5789         n->bar.intmc = n->bar.intms;
5790         trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5791         nvme_irq_check(n);
5792         break;
5793     case NVME_REG_CC:
5794         trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5795 
5796         /* Windows first sends data, then sends enable bit */
5797         if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
5798             !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
5799         {
5800             cc = data;
5801         }
5802 
5803         if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5804             cc = data;
5805 
5806             /* flush CC since nvme_start_ctrl() needs the value */
5807             stl_le_p(&n->bar.cc, cc);
5808             if (unlikely(nvme_start_ctrl(n))) {
5809                 trace_pci_nvme_err_startfail();
5810                 csts = NVME_CSTS_FAILED;
5811             } else {
5812                 trace_pci_nvme_mmio_start_success();
5813                 csts = NVME_CSTS_READY;
5814             }
5815         } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5816             trace_pci_nvme_mmio_stopped();
5817             nvme_ctrl_reset(n);
5818             cc = 0;
5819             csts &= ~NVME_CSTS_READY;
5820         }
5821 
5822         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5823             trace_pci_nvme_mmio_shutdown_set();
5824             nvme_ctrl_shutdown(n);
5825             cc = data;
5826             csts |= NVME_CSTS_SHST_COMPLETE;
5827         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5828             trace_pci_nvme_mmio_shutdown_cleared();
5829             csts &= ~NVME_CSTS_SHST_COMPLETE;
5830             cc = data;
5831         }
5832 
5833         stl_le_p(&n->bar.cc, cc);
5834         stl_le_p(&n->bar.csts, csts);
5835 
5836         break;
5837     case NVME_REG_CSTS:
5838         if (data & (1 << 4)) {
5839             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5840                            "attempted to W1C CSTS.NSSRO"
5841                            " but CAP.NSSRS is zero (not supported)");
5842         } else if (data != 0) {
5843             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5844                            "attempted to set a read only bit"
5845                            " of controller status");
5846         }
5847         break;
5848     case NVME_REG_NSSR:
5849         if (data == 0x4e564d65) {
5850             trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5851         } else {
5852             /* The spec says that writes of other values have no effect */
5853             return;
5854         }
5855         break;
5856     case NVME_REG_AQA:
5857         stl_le_p(&n->bar.aqa, data);
5858         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5859         break;
5860     case NVME_REG_ASQ:
5861         stn_le_p(&n->bar.asq, size, data);
5862         trace_pci_nvme_mmio_asqaddr(data);
5863         break;
5864     case NVME_REG_ASQ + 4:
5865         stl_le_p((uint8_t *)&n->bar.asq + 4, data);
5866         trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
5867         break;
5868     case NVME_REG_ACQ:
5869         trace_pci_nvme_mmio_acqaddr(data);
5870         stn_le_p(&n->bar.acq, size, data);
5871         break;
5872     case NVME_REG_ACQ + 4:
5873         stl_le_p((uint8_t *)&n->bar.acq + 4, data);
5874         trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
5875         break;
5876     case NVME_REG_CMBLOC:
5877         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5878                        "invalid write to reserved CMBLOC"
5879                        " when CMBSZ is zero, ignored");
5880         return;
5881     case NVME_REG_CMBSZ:
5882         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5883                        "invalid write to read only CMBSZ, ignored");
5884         return;
5885     case NVME_REG_CMBMSC:
5886         if (!NVME_CAP_CMBS(cap)) {
5887             return;
5888         }
5889 
5890         stn_le_p(&n->bar.cmbmsc, size, data);
5891         n->cmb.cmse = false;
5892 
5893         if (NVME_CMBMSC_CRE(data)) {
5894             nvme_cmb_enable_regs(n);
5895 
5896             if (NVME_CMBMSC_CMSE(data)) {
5897                 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
5898                 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
5899                 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5900                     uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
5901                     NVME_CMBSTS_SET_CBAI(cmbsts, 1);
5902                     stl_le_p(&n->bar.cmbsts, cmbsts);
5903                     return;
5904                 }
5905 
5906                 n->cmb.cba = cba;
5907                 n->cmb.cmse = true;
5908             }
5909         } else {
5910             n->bar.cmbsz = 0;
5911             n->bar.cmbloc = 0;
5912         }
5913 
5914         return;
5915     case NVME_REG_CMBMSC + 4:
5916         stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
5917         return;
5918 
5919     case NVME_REG_PMRCAP:
5920         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5921                        "invalid write to PMRCAP register, ignored");
5922         return;
5923     case NVME_REG_PMRCTL:
5924         if (!NVME_CAP_PMRS(cap)) {
5925             return;
5926         }
5927 
5928         stl_le_p(&n->bar.pmrctl, data);
5929         if (NVME_PMRCTL_EN(data)) {
5930             memory_region_set_enabled(&n->pmr.dev->mr, true);
5931             pmrsts = 0;
5932         } else {
5933             memory_region_set_enabled(&n->pmr.dev->mr, false);
5934             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
5935             n->pmr.cmse = false;
5936         }
5937         stl_le_p(&n->bar.pmrsts, pmrsts);
5938         return;
5939     case NVME_REG_PMRSTS:
5940         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5941                        "invalid write to PMRSTS register, ignored");
5942         return;
5943     case NVME_REG_PMREBS:
5944         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5945                        "invalid write to PMREBS register, ignored");
5946         return;
5947     case NVME_REG_PMRSWTP:
5948         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5949                        "invalid write to PMRSWTP register, ignored");
5950         return;
5951     case NVME_REG_PMRMSCL:
5952         if (!NVME_CAP_PMRS(cap)) {
5953             return;
5954         }
5955 
5956         stl_le_p(&n->bar.pmrmscl, data);
5957         n->pmr.cmse = false;
5958 
5959         if (NVME_PMRMSCL_CMSE(data)) {
5960             uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
5961             hwaddr cba = pmrmscu << 32 |
5962                 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
5963             if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5964                 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
5965                 stl_le_p(&n->bar.pmrsts, pmrsts);
5966                 return;
5967             }
5968 
5969             n->pmr.cmse = true;
5970             n->pmr.cba = cba;
5971         }
5972 
5973         return;
5974     case NVME_REG_PMRMSCU:
5975         if (!NVME_CAP_PMRS(cap)) {
5976             return;
5977         }
5978 
5979         stl_le_p(&n->bar.pmrmscu, data);
5980         return;
5981     default:
5982         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5983                        "invalid MMIO write,"
5984                        " offset=0x%"PRIx64", data=%"PRIx64"",
5985                        offset, data);
5986         break;
5987     }
5988 }
5989 
5990 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5991 {
5992     NvmeCtrl *n = (NvmeCtrl *)opaque;
5993     uint8_t *ptr = (uint8_t *)&n->bar;
5994 
5995     trace_pci_nvme_mmio_read(addr, size);
5996 
5997     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5998         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5999                        "MMIO read not 32-bit aligned,"
6000                        " offset=0x%"PRIx64"", addr);
6001         /* should RAZ, fall through for now */
6002     } else if (unlikely(size < sizeof(uint32_t))) {
6003         NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6004                        "MMIO read smaller than 32-bits,"
6005                        " offset=0x%"PRIx64"", addr);
6006         /* should RAZ, fall through for now */
6007     }
6008 
6009     if (addr > sizeof(n->bar) - size) {
6010         NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6011                        "MMIO read beyond last register,"
6012                        " offset=0x%"PRIx64", returning 0", addr);
6013 
6014         return 0;
6015     }
6016 
6017     /*
6018      * When PMRWBM bit 1 is set, a read from PMRSTS should ensure
6019      * that prior writes have made it to persistent media before the
6020      * read completes.
6021      */
6022     if (addr == NVME_REG_PMRSTS &&
6023         (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6024         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6025     }
6026 
6027     return ldn_le_p(ptr + addr, size);
6028 }
6029 
6030 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6031 {
6032     uint32_t qid;
6033 
6034     if (unlikely(addr & ((1 << 2) - 1))) {
6035         NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6036                        "doorbell write not 32-bit aligned,"
6037                        " offset=0x%"PRIx64", ignoring", addr);
6038         return;
6039     }
6040 
6041     if (((addr - 0x1000) >> 2) & 1) {
6042         /* Completion queue doorbell write */
6043 
6044         uint16_t new_head = val & 0xffff;
6045         int start_sqs;
6046         NvmeCQueue *cq;
6047 
6048         qid = (addr - (0x1000 + (1 << 2))) >> 3;
6049         if (unlikely(nvme_check_cqid(n, qid))) {
6050             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6051                            "completion queue doorbell write"
6052                            " for nonexistent queue,"
6053                            " sqid=%"PRIu32", ignoring", qid);
6054 
6055             /*
6056              * NVM Express v1.3d, Section 4.1 states: "If host software writes
6057              * an invalid value to the Submission Queue Tail Doorbell or
6058              * Completion Queue Head Doorbell register and an Asynchronous Event
6059              * Request command is outstanding, then an asynchronous event is
6060              * posted to the Admin Completion Queue with a status code of
6061              * Invalid Doorbell Write Value."
6062              *
6063              * Also note that the spec includes the "Invalid Doorbell Register"
6064              * status code, but nowhere does it specify when to use it.
6065              * However, it seems reasonable to use it here in a similar
6066              * fashion.
6067              */
6068             if (n->outstanding_aers) {
6069                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6070                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6071                                    NVME_LOG_ERROR_INFO);
6072             }
6073 
6074             return;
6075         }
6076 
6077         cq = n->cq[qid];
6078         if (unlikely(new_head >= cq->size)) {
6079             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6080                            "completion queue doorbell write value"
6081                            " beyond queue size, sqid=%"PRIu32","
6082                            " new_head=%"PRIu16", ignoring",
6083                            qid, new_head);
6084 
6085             if (n->outstanding_aers) {
6086                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6087                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6088                                    NVME_LOG_ERROR_INFO);
6089             }
6090 
6091             return;
6092         }
6093 
6094         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6095 
6096         start_sqs = nvme_cq_full(cq) ? 1 : 0;
6097         cq->head = new_head;
6098         if (start_sqs) {
6099             NvmeSQueue *sq;
6100             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6101                 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6102             }
6103             timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6104         }
6105 
6106         if (cq->tail == cq->head) {
6107             if (cq->irq_enabled) {
6108                 n->cq_pending--;
6109             }
6110 
6111             nvme_irq_deassert(n, cq);
6112         }
6113     } else {
6114         /* Submission queue doorbell write */
6115 
6116         uint16_t new_tail = val & 0xffff;
6117         NvmeSQueue *sq;
6118 
6119         qid = (addr - 0x1000) >> 3;
6120         if (unlikely(nvme_check_sqid(n, qid))) {
6121             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6122                            "submission queue doorbell write"
6123                            " for nonexistent queue,"
6124                            " sqid=%"PRIu32", ignoring", qid);
6125 
6126             if (n->outstanding_aers) {
6127                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6128                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6129                                    NVME_LOG_ERROR_INFO);
6130             }
6131 
6132             return;
6133         }
6134 
6135         sq = n->sq[qid];
6136         if (unlikely(new_tail >= sq->size)) {
6137             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6138                            "submission queue doorbell write value"
6139                            " beyond queue size, sqid=%"PRIu32","
6140                            " new_tail=%"PRIu16", ignoring",
6141                            qid, new_tail);
6142 
6143             if (n->outstanding_aers) {
6144                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6145                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6146                                    NVME_LOG_ERROR_INFO);
6147             }
6148 
6149             return;
6150         }
6151 
6152         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6153 
6154         sq->tail = new_tail;
6155         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6156     }
6157 }
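/*
 * Doorbell address arithmetic used above, with the 4-byte doorbell stride this
 * controller implements (worked example, illustrative only):
 *
 *     0x1000 -> SQ 0 tail (admin)      0x1004 -> CQ 0 head (admin)
 *     0x1008 -> SQ 1 tail              0x100c -> CQ 1 head
 *
 * e.g. for addr=0x100c: ((addr - 0x1000) >> 2) & 1 == 1 selects the CQ branch
 * and qid = (addr - 0x1004) >> 3 == 1.
 */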
6158 
6159 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6160                             unsigned size)
6161 {
6162     NvmeCtrl *n = (NvmeCtrl *)opaque;
6163 
6164     trace_pci_nvme_mmio_write(addr, data, size);
6165 
6166     if (addr < sizeof(n->bar)) {
6167         nvme_write_bar(n, addr, data, size);
6168     } else {
6169         nvme_process_db(n, addr, data);
6170     }
6171 }
6172 
6173 static const MemoryRegionOps nvme_mmio_ops = {
6174     .read = nvme_mmio_read,
6175     .write = nvme_mmio_write,
6176     .endianness = DEVICE_LITTLE_ENDIAN,
6177     .impl = {
6178         .min_access_size = 2,
6179         .max_access_size = 8,
6180     },
6181 };
6182 
6183 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6184                            unsigned size)
6185 {
6186     NvmeCtrl *n = (NvmeCtrl *)opaque;
6187     stn_le_p(&n->cmb.buf[addr], size, data);
6188 }
6189 
6190 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6191 {
6192     NvmeCtrl *n = (NvmeCtrl *)opaque;
6193     return ldn_le_p(&n->cmb.buf[addr], size);
6194 }
6195 
6196 static const MemoryRegionOps nvme_cmb_ops = {
6197     .read = nvme_cmb_read,
6198     .write = nvme_cmb_write,
6199     .endianness = DEVICE_LITTLE_ENDIAN,
6200     .impl = {
6201         .min_access_size = 1,
6202         .max_access_size = 8,
6203     },
6204 };
6205 
6206 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6207 {
6208     NvmeParams *params = &n->params;
6209 
6210     if (params->num_queues) {
6211         warn_report("num_queues is deprecated; please use max_ioqpairs "
6212                     "instead");
6213 
6214         params->max_ioqpairs = params->num_queues - 1;
6215     }
6216 
6217     if (n->namespace.blkconf.blk && n->subsys) {
6218         error_setg(errp, "subsystem support is unavailable with legacy "
6219                    "namespace ('drive' property)");
6220         return;
6221     }
6222 
6223     if (params->max_ioqpairs < 1 ||
6224         params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6225         error_setg(errp, "max_ioqpairs must be between 1 and %d",
6226                    NVME_MAX_IOQPAIRS);
6227         return;
6228     }
6229 
6230     if (params->msix_qsize < 1 ||
6231         params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6232         error_setg(errp, "msix_qsize must be between 1 and %d",
6233                    PCI_MSIX_FLAGS_QSIZE + 1);
6234         return;
6235     }
6236 
6237     if (!params->serial) {
6238         error_setg(errp, "serial property not set");
6239         return;
6240     }
6241 
6242     if (n->pmr.dev) {
6243         if (host_memory_backend_is_mapped(n->pmr.dev)) {
6244             error_setg(errp, "can't use already busy memdev: %s",
6245                        object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6246             return;
6247         }
6248 
6249         if (!is_power_of_2(n->pmr.dev->size)) {
6250             error_setg(errp, "pmr backend size needs to be a power of 2");
6251             return;
6252         }
6253 
6254         host_memory_backend_set_mapped(n->pmr.dev, true);
6255     }
6256 
6257     if (n->params.zasl > n->params.mdts) {
6258         error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6259                    "than or equal to mdts (Maximum Data Transfer Size)");
6260         return;
6261     }
6262 
6263     if (!n->params.vsl) {
6264         error_setg(errp, "vsl must be non-zero");
6265         return;
6266     }
6267 }
6268 
6269 static void nvme_init_state(NvmeCtrl *n)
6270 {
6271     /* add one to max_ioqpairs to account for the admin queue pair */
6272     n->reg_size = pow2ceil(sizeof(NvmeBar) +
6273                            2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6274     n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6275     n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6276     n->temperature = NVME_TEMPERATURE;
6277     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6278     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6279     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6280 }
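/*
 * Worked example for the register window size (assuming the 4 KiB NvmeBar
 * register page, 4-byte doorbell registers and the default max_ioqpairs=64):
 *
 *     reg_size = pow2ceil(4096 + 2 * (64 + 1) * 4)
 *              = pow2ceil(4616) = 8192 bytes
 *
 * i.e. one page of registers followed by one page of submission/completion
 * doorbell pairs.
 */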
6281 
6282 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6283 {
6284     uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6285     uint64_t cap = ldq_le_p(&n->bar.cap);
6286 
6287     n->cmb.buf = g_malloc0(cmb_size);
6288     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6289                           "nvme-cmb", cmb_size);
6290     pci_register_bar(pci_dev, NVME_CMB_BIR,
6291                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6292                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6293                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6294 
6295     NVME_CAP_SET_CMBS(cap, 1);
6296     stq_le_p(&n->bar.cap, cap);
6297 
6298     if (n->params.legacy_cmb) {
6299         nvme_cmb_enable_regs(n);
6300         n->cmb.cmse = true;
6301     }
6302 }
6303 
6304 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6305 {
6306     uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6307 
6308     NVME_PMRCAP_SET_RDS(pmrcap, 1);
6309     NVME_PMRCAP_SET_WDS(pmrcap, 1);
6310     NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6311     /* Turn on PMRWBM bit 1 support (reads ensure prior writes are persistent) */
6312     NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6313     NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6314     stl_le_p(&n->bar.pmrcap, pmrcap);
6315 
6316     pci_register_bar(pci_dev, NVME_PMR_BIR,
6317                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6318                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6319                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6320 
6321     memory_region_set_enabled(&n->pmr.dev->mr, false);
6322 }
6323 
6324 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6325 {
6326     uint8_t *pci_conf = pci_dev->config;
6327     uint64_t bar_size, msix_table_size, msix_pba_size;
6328     unsigned msix_table_offset, msix_pba_offset;
6329     int ret;
6330 
6331     Error *err = NULL;
6332 
6333     pci_conf[PCI_INTERRUPT_PIN] = 1;
6334     pci_config_set_prog_interface(pci_conf, 0x2);
6335 
6336     if (n->params.use_intel_id) {
6337         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6338         pci_config_set_device_id(pci_conf, 0x5845);
6339     } else {
6340         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6341         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6342     }
6343 
6344     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6345     pcie_endpoint_cap_init(pci_dev, 0x80);
6346 
6347     bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6348     msix_table_offset = bar_size;
6349     msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6350 
6351     bar_size += msix_table_size;
6352     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6353     msix_pba_offset = bar_size;
6354     msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6355 
6356     bar_size += msix_pba_size;
6357     bar_size = pow2ceil(bar_size);
6358 
6359     memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6360     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6361                           n->reg_size);
6362     memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6363 
6364     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6365                      PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6366     ret = msix_init(pci_dev, n->params.msix_qsize,
6367                     &n->bar0, 0, msix_table_offset,
6368                     &n->bar0, 0, msix_pba_offset, 0, &err);
6369     if (ret < 0) {
6370         if (ret == -ENOTSUP) {
6371             warn_report_err(err);
6372         } else {
6373             error_propagate(errp, err);
6374             return ret;
6375         }
6376     }
6377 
6378     if (n->params.cmb_size_mb) {
6379         nvme_init_cmb(n, pci_dev);
6380     }
6381 
6382     if (n->pmr.dev) {
6383         nvme_init_pmr(n, pci_dev);
6384     }
6385 
6386     return 0;
6387 }
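/*
 * BAR0 layout sketch under the same assumptions as the reg_size example above
 * (reg_size = 8 KiB, default msix_qsize=65, 16-byte MSI-X table entries):
 *
 *     0x0000  NVMe registers + doorbells      (8 KiB, 4 KiB aligned)
 *     0x2000  MSI-X table, 65 * 16 = 1040 B
 *     0x3000  MSI-X PBA, ALIGN_UP(65, 64) / 8 = 16 B
 *
 * bar_size is then rounded up to the next power of two, 16 KiB.
 */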
6388 
6389 static void nvme_init_subnqn(NvmeCtrl *n)
6390 {
6391     NvmeSubsystem *subsys = n->subsys;
6392     NvmeIdCtrl *id = &n->id_ctrl;
6393 
6394     if (!subsys) {
6395         snprintf((char *)id->subnqn, sizeof(id->subnqn),
6396                  "nqn.2019-08.org.qemu:%s", n->params.serial);
6397     } else {
6398         pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6399     }
6400 }
6401 
6402 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6403 {
6404     NvmeIdCtrl *id = &n->id_ctrl;
6405     uint8_t *pci_conf = pci_dev->config;
6406     uint64_t cap = ldq_le_p(&n->bar.cap);
6407 
6408     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6409     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6410     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6411     strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6412     strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6413 
6414     id->cntlid = cpu_to_le16(n->cntlid);
6415 
6416     id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6417 
6418     id->rab = 6;
6419 
6420     if (n->params.use_intel_id) {
6421         id->ieee[0] = 0xb3;
6422         id->ieee[1] = 0x02;
6423         id->ieee[2] = 0x00;
6424     } else {
6425         id->ieee[0] = 0x00;
6426         id->ieee[1] = 0x54;
6427         id->ieee[2] = 0x52;
6428     }
6429 
6430     id->mdts = n->params.mdts;
6431     id->ver = cpu_to_le32(NVME_SPEC_VER);
6432     id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6433     id->cntrltype = 0x1;
6434 
6435     /*
6436      * Because the controller always completes the Abort command immediately,
6437      * there can never be more than one concurrently executing Abort command,
6438      * so this value is never used for anything. Note that there can easily be
6439      * many Abort commands in the queues, but they are not considered
6440      * "executing" until processed by nvme_abort.
6441      *
6442      * The specification recommends a value of 3 for Abort Command Limit (four
6443      * concurrently outstanding Abort commands), so let's use that, though it is
6444      * inconsequential.
6445      */
6446     id->acl = 3;
6447     id->aerl = n->params.aerl;
6448     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6449     id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6450 
6451     /* recommended default value (~70 C) */
6452     id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6453     id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6454 
6455     id->sqes = (0x6 << 4) | 0x6;
6456     id->cqes = (0x4 << 4) | 0x4;
6457     id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6458     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6459                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6460                            NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6461 
6462     /*
6463      * NOTE: If this device ever supports a command set that does NOT use 0x0
6464      * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6465      * should probably be removed.
6466      *
6467      * See comment in nvme_io_cmd.
6468      */
6469     id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6470 
6471     id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6472     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6473                            NVME_CTRL_SGLS_BITBUCKET);
6474 
6475     nvme_init_subnqn(n);
6476 
6477     id->psd[0].mp = cpu_to_le16(0x9c4);
6478     id->psd[0].enlat = cpu_to_le32(0x10);
6479     id->psd[0].exlat = cpu_to_le32(0x4);
6480 
6481     if (n->subsys) {
6482         id->cmic |= NVME_CMIC_MULTI_CTRL;
6483     }
6484 
6485     NVME_CAP_SET_MQES(cap, 0x7ff);
6486     NVME_CAP_SET_CQR(cap, 1);
6487     NVME_CAP_SET_TO(cap, 0xf);
6488     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
6489     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
6490     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
6491     NVME_CAP_SET_MPSMAX(cap, 4);
6492     NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
6493     NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
6494     stq_le_p(&n->bar.cap, cap);
6495 
6496     stl_le_p(&n->bar.vs, NVME_SPEC_VER);
6497     n->bar.intmc = n->bar.intms = 0;
6498 }
6499 
6500 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6501 {
6502     int cntlid;
6503 
6504     if (!n->subsys) {
6505         return 0;
6506     }
6507 
6508     cntlid = nvme_subsys_register_ctrl(n, errp);
6509     if (cntlid < 0) {
6510         return -1;
6511     }
6512 
6513     n->cntlid = cntlid;
6514 
6515     return 0;
6516 }
6517 
6518 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6519 {
6520     uint32_t nsid = ns->params.nsid;
6521     assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6522 
6523     n->namespaces[nsid] = ns;
6524     ns->attached++;
6525 
6526     n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6527                             BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6528 }
6529 
6530 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6531 {
6532     NvmeCtrl *n = NVME(pci_dev);
6533     NvmeNamespace *ns;
6534     Error *local_err = NULL;
6535 
6536     nvme_check_constraints(n, &local_err);
6537     if (local_err) {
6538         error_propagate(errp, local_err);
6539         return;
6540     }
6541 
6542     qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6543                         &pci_dev->qdev, n->parent_obj.qdev.id);
6544 
6545     nvme_init_state(n);
6546     if (nvme_init_pci(n, pci_dev, errp)) {
6547         return;
6548     }
6549 
6550     if (nvme_init_subsys(n, errp)) {
6551         /* errp is already set by nvme_init_subsys() */
6552         return;
6553     }
6554     nvme_init_ctrl(n, pci_dev);
6555 
6556     /* setup a namespace if the controller drive property was given */
6557     if (n->namespace.blkconf.blk) {
6558         ns = &n->namespace;
6559         ns->params.nsid = 1;
6560 
6561         if (nvme_ns_setup(ns, errp)) {
6562             return;
6563         }
6564 
6565         nvme_attach_ns(n, ns);
6566     }
6567 }
6568 
6569 static void nvme_exit(PCIDevice *pci_dev)
6570 {
6571     NvmeCtrl *n = NVME(pci_dev);
6572     NvmeNamespace *ns;
6573     int i;
6574 
6575     nvme_ctrl_reset(n);
6576 
6577     if (n->subsys) {
6578         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6579             ns = nvme_ns(n, i);
6580             if (ns) {
6581                 ns->attached--;
6582             }
6583         }
6584 
6585         nvme_subsys_unregister_ctrl(n->subsys, n);
6586     }
6587 
6588     g_free(n->cq);
6589     g_free(n->sq);
6590     g_free(n->aer_reqs);
6591 
6592     if (n->params.cmb_size_mb) {
6593         g_free(n->cmb.buf);
6594     }
6595 
6596     if (n->pmr.dev) {
6597         host_memory_backend_set_mapped(n->pmr.dev, false);
6598     }
6599     msix_uninit(pci_dev, &n->bar0, &n->bar0);
6600     memory_region_del_subregion(&n->bar0, &n->iomem);
6601 }
6602 
6603 static Property nvme_props[] = {
6604     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6605     DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6606                      HostMemoryBackend *),
6607     DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6608                      NvmeSubsystem *),
6609     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6610     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6611     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6612     DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6613     DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6614     DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6615     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6616     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6617     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6618     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6619     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6620     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6621     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6622                      params.auto_transition_zones, true),
6623     DEFINE_PROP_END_OF_LIST(),
6624 };
6625 
6626 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6627                                    void *opaque, Error **errp)
6628 {
6629     NvmeCtrl *n = NVME(obj);
6630     uint8_t value = n->smart_critical_warning;
6631 
6632     visit_type_uint8(v, name, &value, errp);
6633 }
6634 
6635 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6636                                    void *opaque, Error **errp)
6637 {
6638     NvmeCtrl *n = NVME(obj);
6639     uint8_t value, old_value, cap = 0, index, event;
6640 
6641     if (!visit_type_uint8(v, name, &value, errp)) {
6642         return;
6643     }
6644 
6645     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6646           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6647     if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6648         cap |= NVME_SMART_PMR_UNRELIABLE;
6649     }
6650 
6651     if ((value & cap) != value) {
6652         error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6653                    value & ~cap);
6654         return;
6655     }
6656 
6657     old_value = n->smart_critical_warning;
6658     n->smart_critical_warning = value;
6659 
6660     /* only inject new bits of smart critical warning */
6661     for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6662         event = 1 << index;
6663         if (value & ~old_value & event)
6664             nvme_smart_event(n, event);
6665     }
6666 }
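/*
 * The property registered in nvme_instance_init() below can be poked at
 * runtime to inject SMART critical warnings. A sketch using the QOM monitor
 * command (the device path assumes the controller was created with id=nvme0):
 *
 *     (qemu) qom-set /machine/peripheral/nvme0 smart_critical_warning 2
 *
 * Bit 1 (0x2) corresponds to NVME_SMART_TEMPERATURE, so this raises a
 * temperature-above-threshold warning event if that bit was not already set.
 */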
6667 
6668 static const VMStateDescription nvme_vmstate = {
6669     .name = "nvme",
6670     .unmigratable = 1,
6671 };
6672 
6673 static void nvme_class_init(ObjectClass *oc, void *data)
6674 {
6675     DeviceClass *dc = DEVICE_CLASS(oc);
6676     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6677 
6678     pc->realize = nvme_realize;
6679     pc->exit = nvme_exit;
6680     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6681     pc->revision = 2;
6682 
6683     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6684     dc->desc = "Non-Volatile Memory Express";
6685     device_class_set_props(dc, nvme_props);
6686     dc->vmsd = &nvme_vmstate;
6687 }
6688 
6689 static void nvme_instance_init(Object *obj)
6690 {
6691     NvmeCtrl *n = NVME(obj);
6692 
6693     device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6694                                   "bootindex", "/namespace@1,0",
6695                                   DEVICE(obj));
6696 
6697     object_property_add(obj, "smart_critical_warning", "uint8",
6698                         nvme_get_smart_warning,
6699                         nvme_set_smart_warning, NULL, NULL);
6700 }
6701 
6702 static const TypeInfo nvme_info = {
6703     .name          = TYPE_NVME,
6704     .parent        = TYPE_PCI_DEVICE,
6705     .instance_size = sizeof(NvmeCtrl),
6706     .instance_init = nvme_instance_init,
6707     .class_init    = nvme_class_init,
6708     .interfaces = (InterfaceInfo[]) {
6709         { INTERFACE_PCIE_DEVICE },
6710         { }
6711     },
6712 };
6713 
6714 static const TypeInfo nvme_bus_info = {
6715     .name = TYPE_NVME_BUS,
6716     .parent = TYPE_BUS,
6717     .instance_size = sizeof(NvmeBus),
6718 };
6719 
6720 static void nvme_register_types(void)
6721 {
6722     type_register_static(&nvme_info);
6723     type_register_static(&nvme_bus_info);
6724 }
6725 
6726 type_init(nvme_register_types)
6727