xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 62fffaa6)
1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <keith.busch@intel.com>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10 
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13  *
14  *  https://nvmexpress.org/developers/nvme-specification/
15  *
16  *
17  * Notes on coding style
18  * ---------------------
19  * While QEMU coding style prefers lowercase hexadecimals in constants, the
20  * NVMe subsystem uses the format from the NVMe specifications in the comments
21  * (i.e. 'h' suffix instead of '0x' prefix).
22  *
23  * Usage
24  * -----
25  * See docs/system/nvme.rst for extensive documentation.
26  *
27  * Add options:
28  *      -drive file=<file>,if=none,id=<drive_id>
29  *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30  *      -device nvme,serial=<serial>,id=<bus_name>, \
31  *              cmb_size_mb=<cmb_size_mb[optional]>, \
32  *              [pmrdev=<mem_backend_file_id>,] \
33  *              max_ioqpairs=<N[optional]>, \
34  *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35  *              mdts=<N[optional]>,vsl=<N[optional]>, \
36  *              zoned.zasl=<N[optional]>, \
37  *              zoned.auto_transition=<on|off[optional]>, \
38  *              subsys=<subsys_id>
39  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
40  *              zoned=<true|false[optional]>, \
41  *              subsys=<subsys_id>,detached=<true|false[optional]>
42  *
43  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
44  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
45  * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
46  * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
47  *
48  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
49  * For example:
50  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
51  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
52  *
53  * The PMR will use BAR 4/5 exclusively.
54  *
55  * To place controller(s) and namespace(s) to a subsystem, then provide
56  * nvme-subsys device as above.
57  *
58  * nvme subsystem device parameters
59  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60  * - `nqn`
61  *   This parameter provides the `<nqn_id>` part of the string
62  *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
63  *   of subsystem controllers. Note that `<nqn_id>` should be unique per
64  *   subsystem, but this is not enforced by QEMU. If not specified, it will
65  *   default to the value of the `id` parameter (`<subsys_id>`).
66  *
67  * nvme device parameters
68  * ~~~~~~~~~~~~~~~~~~~~~~
69  * - `subsys`
70  *   Specifying this parameter attaches the controller to the subsystem and
71  *   the SUBNQN field in the controller will report the NQN of the subsystem
72  *   device. This also enables multi controller capability represented in
73  *   Identify Controller data structure in CMIC (Controller Multi-path I/O and
74  *   Namespace Sharing Capabilities).
75  *
76  * - `aerl`
77  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
78  *   of concurrently outstanding Asynchronous Event Request commands
79  *   supported by the controller. This is a 0's based value.
80  *
81  * - `aer_max_queued`
82  *   This is the maximum number of events that the device will enqueue for
83  *   completion when there are no outstanding AERs. When the maximum number of
84  *   enqueued events are reached, subsequent events will be dropped.
85  *
86  * - `mdts`
87  *   Indicates the maximum data transfer size for a command that transfers data
88  *   between host-accessible memory and the controller. The value is specified
89  *   as a power of two (2^n) and is in units of the minimum memory page size
90  *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
91  *
92  * - `vsl`
93  *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
94  *   this value is specified as a power of two (2^n) and is in units of the
95  *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
96  *   KiB).
97  *
98  * - `zoned.zasl`
99  *   Indicates the maximum data transfer size for the Zone Append command. Like
100  *   `mdts`, the value is specified as a power of two (2^n) and is in units of
101  *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
102  *   defaulting to the value of `mdts`).
103  *
104  * - `zoned.auto_transition`
105  *   Indicates if zones in zone state implicitly opened can be automatically
106  *   transitioned to zone state closed for resource management purposes.
107  *   Defaults to 'on'.
108  *
109  * nvme namespace device parameters
110  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111  * - `shared`
112  *   When the parent nvme device (as defined explicitly by the 'bus' parameter
113  *   or implicitly by the most recently defined NvmeBus) is linked to an
114  *   nvme-subsys device, the namespace will be attached to all controllers in
115  *   the subsystem. If set to 'off' (the default), the namespace will remain a
116  *   private namespace and may only be attached to a single controller at a
117  *   time.
118  *
119  * - `detached`
120  *   This parameter is only valid together with the `subsys` parameter. If left
121  *   at the default value (`false/off`), the namespace will be attached to all
122  *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
123  *   namespace will be available in the subsystem but not attached to any
124  *   controllers.
125  *
126  * Setting `zoned` to true selects Zoned Command Set at the namespace.
127  * In this case, the following namespace properties are available to configure
128  * zoned operation:
129  *     zoned.zone_size=<zone size in bytes, default: 128MiB>
130  *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
131  *
132  *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
133  *         The value 0 (default) forces zone capacity to be the same as zone
134  *         size. The value of this property may not exceed zone size.
135  *
136  *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
137  *         This value needs to be specified in 64B units. If it is zero,
138  *         namespace(s) will not support zone descriptor extensions.
139  *
140  *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
141  *         The default value means there is no limit to the number of
142  *         concurrently active zones.
143  *
144  *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
145  *         The default value means there is no limit to the number of
146  *         concurrently open zones.
147  *
148  *     zoned.cross_read=<enable RAZB, default: false>
149  *         Setting this property to true enables Read Across Zone Boundaries.
150  */
151 
152 #include "qemu/osdep.h"
153 #include "qemu/cutils.h"
154 #include "qemu/error-report.h"
155 #include "qemu/log.h"
156 #include "qemu/units.h"
157 #include "qapi/error.h"
158 #include "qapi/visitor.h"
159 #include "sysemu/sysemu.h"
160 #include "sysemu/block-backend.h"
161 #include "sysemu/hostmem.h"
162 #include "hw/pci/msix.h"
163 #include "migration/vmstate.h"
164 
165 #include "nvme.h"
166 #include "trace.h"
167 
/*
 * Controller-wide constants.
 *
 * NVME_CMB_BIR and NVME_PMR_BIR match the usage notes at the top of this
 * file: the CMB lives in BAR2 and the PMR uses BAR 4/5.
 * NVME_DEFAULT_MAX_ZA_SIZE is 128 KiB.
 *
 * NOTE(review): NVME_SPEC_VER presumably encodes version 1.4.0 and the
 * NVME_TEMPERATURE* values are presumably in Kelvin (0x143 == 323) -
 * confirm against the users of these macros, which are not visible here.
 */
168 #define NVME_MAX_IOQPAIRS 0xffff
169 #define NVME_DB_SIZE  4
170 #define NVME_SPEC_VER 0x00010400
171 #define NVME_CMB_BIR 2
172 #define NVME_PMR_BIR 4
173 #define NVME_TEMPERATURE 0x143
174 #define NVME_TEMPERATURE_WARNING 0x157
175 #define NVME_TEMPERATURE_CRITICAL 0x175
176 #define NVME_NUM_FW_SLOTS 1
177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
178 
/*
 * Report a guest-triggered error.
 *
 * Calls the trace function trace_<trace>() with the variadic arguments and
 * additionally logs "<trace> in <calling function>: <fmt>" through
 * qemu_log_mask() with the LOG_GUEST_ERROR mask. @trace must therefore be
 * the bare name of a trace event (it is both token-pasted and stringified),
 * and @fmt must be a string literal since it is concatenated with other
 * literals.
 */
179 #define NVME_GUEST_ERR(trace, fmt, ...) \
180     do { \
181         (trace_##trace)(__VA_ARGS__); \
182         qemu_log_mask(LOG_GUEST_ERROR, #trace \
183             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
184     } while (0)
185 
/*
 * Per-feature support flags, indexed by feature identifier. Only the
 * identifiers listed here are marked true; every other slot below
 * NVME_FID_MAX is implicitly zero-initialized, i.e. false.
 */
186 static const bool nvme_feature_support[NVME_FID_MAX] = {
187     [NVME_ARBITRATION]              = true,
188     [NVME_POWER_MANAGEMENT]         = true,
189     [NVME_TEMPERATURE_THRESHOLD]    = true,
190     [NVME_ERROR_RECOVERY]           = true,
191     [NVME_VOLATILE_WRITE_CACHE]     = true,
192     [NVME_NUMBER_OF_QUEUES]         = true,
193     [NVME_INTERRUPT_COALESCING]     = true,
194     [NVME_INTERRUPT_VECTOR_CONF]    = true,
195     [NVME_WRITE_ATOMICITY]          = true,
196     [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
197     [NVME_TIMESTAMP]                = true,
198     [NVME_COMMAND_SET_PROFILE]      = true,
199 };
200 
/*
 * Per-feature capability bits (NVME_FEAT_CAP_*), indexed by feature
 * identifier. Features that are not listed have no capability bits set
 * (implicit zero initialization).
 *
 * NOTE(review): NVME_FEAT_CAP_CHANGE presumably marks a feature as
 * changeable and NVME_FEAT_CAP_NS as namespace specific - confirm against
 * the definitions, which are outside this section.
 */
201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
202     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
203     [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
204     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
205     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
206     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
207     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
208     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
209 };
210 
/*
 * Command effects for admin commands, indexed by admin opcode (256 slots).
 * Listed opcodes carry NVME_CMD_EFF_CSUPP plus any further effect bits;
 * every opcode not listed is implicitly zero.
 *
 * NOTE(review): presumably the source for the admin part of the Commands
 * Supported and Effects log page - the consumer is not visible here.
 */
211 static const uint32_t nvme_cse_acs[256] = {
212     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
213     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
214     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
215     [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
216     [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
217     [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
218     [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
219     [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
220     [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
221     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
222     [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
223     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
224 };
225 
/* I/O command effects table with no entries set: all 256 slots are zero. */
226 static const uint32_t nvme_cse_iocs_none[256];
227 
/*
 * I/O command effects for the NVM command set, indexed by I/O opcode.
 * Opcodes that are not listed are implicitly zero. Flush, Write Zeroes,
 * Write, Dataset Management and Copy additionally carry NVME_CMD_EFF_LBCC.
 */
228 static const uint32_t nvme_cse_iocs_nvm[256] = {
229     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
231     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
233     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
235     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
236     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
237 };
238 
/*
 * I/O command effects for the Zoned command set, indexed by I/O opcode.
 * Contains the same entries as nvme_cse_iocs_nvm plus Zone Append, Zone
 * Management Send and Zone Management Receive. Opcodes that are not listed
 * are implicitly zero.
 */
239 static const uint32_t nvme_cse_iocs_zoned[256] = {
240     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
241     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
242     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
243     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
244     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
245     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
246     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
247     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
248     [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
249     [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
250     [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
251 };
252 
/*
 * Forward declaration; the definition is outside this section of the file.
 * NOTE(review): @opaque is presumably the submission queue to process -
 * confirm at the definition.
 */
253 static void nvme_process_sq(void *opaque);
254 
255 static uint16_t nvme_sqid(NvmeRequest *req)
256 {
257     return le16_to_cpu(req->sq->sqid);
258 }
259 
260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261                                    NvmeZoneState state)
262 {
263     if (QTAILQ_IN_USE(zone, entry)) {
264         switch (nvme_get_zone_state(zone)) {
265         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266             QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267             break;
268         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270             break;
271         case NVME_ZONE_STATE_CLOSED:
272             QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273             break;
274         case NVME_ZONE_STATE_FULL:
275             QTAILQ_REMOVE(&ns->full_zones, zone, entry);
276         default:
277             ;
278         }
279     }
280 
281     nvme_set_zone_state(zone, state);
282 
283     switch (state) {
284     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285         QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286         break;
287     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288         QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289         break;
290     case NVME_ZONE_STATE_CLOSED:
291         QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292         break;
293     case NVME_ZONE_STATE_FULL:
294         QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
295     case NVME_ZONE_STATE_READ_ONLY:
296         break;
297     default:
298         zone->d.za = 0;
299     }
300 }
301 
302 /*
303  * Check if we can open a zone without exceeding open/active limits.
304  * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
305  */
306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
307 {
308     if (ns->params.max_active_zones != 0 &&
309         ns->nr_active_zones + act > ns->params.max_active_zones) {
310         trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
311         return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
312     }
313     if (ns->params.max_open_zones != 0 &&
314         ns->nr_open_zones + opn > ns->params.max_open_zones) {
315         trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
316         return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
317     }
318 
319     return NVME_SUCCESS;
320 }
321 
322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
323 {
324     hwaddr hi, lo;
325 
326     if (!n->cmb.cmse) {
327         return false;
328     }
329 
330     lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
331     hi = lo + int128_get64(n->cmb.mem.size);
332 
333     return addr >= lo && addr < hi;
334 }
335 
336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
337 {
338     hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
339     return &n->cmb.buf[addr - base];
340 }
341 
342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
343 {
344     hwaddr hi;
345 
346     if (!n->pmr.cmse) {
347         return false;
348     }
349 
350     hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
351 
352     return addr >= n->pmr.cba && addr < hi;
353 }
354 
355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
356 {
357     return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
358 }
359 
360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
361 {
362     hwaddr hi = addr + size - 1;
363     if (hi < addr) {
364         return 1;
365     }
366 
367     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
368         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
369         return 0;
370     }
371 
372     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
373         memcpy(buf, nvme_addr_to_pmr(n, addr), size);
374         return 0;
375     }
376 
377     return pci_dma_read(&n->parent_obj, addr, buf, size);
378 }
379 
380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
381 {
382     hwaddr hi = addr + size - 1;
383     if (hi < addr) {
384         return 1;
385     }
386 
387     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
388         memcpy(nvme_addr_to_cmb(n, addr), buf, size);
389         return 0;
390     }
391 
392     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
393         memcpy(nvme_addr_to_pmr(n, addr), buf, size);
394         return 0;
395     }
396 
397     return pci_dma_write(&n->parent_obj, addr, buf, size);
398 }
399 
400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
401 {
402     return nsid &&
403         (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
404 }
405 
406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
407 {
408     return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
409 }
410 
411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
412 {
413     return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
414 }
415 
416 static void nvme_inc_cq_tail(NvmeCQueue *cq)
417 {
418     cq->tail++;
419     if (cq->tail >= cq->size) {
420         cq->tail = 0;
421         cq->phase = !cq->phase;
422     }
423 }
424 
425 static void nvme_inc_sq_head(NvmeSQueue *sq)
426 {
427     sq->head = (sq->head + 1) % sq->size;
428 }
429 
430 static uint8_t nvme_cq_full(NvmeCQueue *cq)
431 {
432     return (cq->tail + 1) % cq->size == cq->head;
433 }
434 
435 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
436 {
437     return sq->head == sq->tail;
438 }
439 
440 static void nvme_irq_check(NvmeCtrl *n)
441 {
442     uint32_t intms = ldl_le_p(&n->bar.intms);
443 
444     if (msix_enabled(&(n->parent_obj))) {
445         return;
446     }
447     if (~intms & n->irq_status) {
448         pci_irq_assert(&n->parent_obj);
449     } else {
450         pci_irq_deassert(&n->parent_obj);
451     }
452 }
453 
454 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
455 {
456     if (cq->irq_enabled) {
457         if (msix_enabled(&(n->parent_obj))) {
458             trace_pci_nvme_irq_msix(cq->vector);
459             msix_notify(&(n->parent_obj), cq->vector);
460         } else {
461             trace_pci_nvme_irq_pin();
462             assert(cq->vector < 32);
463             n->irq_status |= 1 << cq->vector;
464             nvme_irq_check(n);
465         }
466     } else {
467         trace_pci_nvme_irq_masked();
468     }
469 }
470 
471 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
472 {
473     if (cq->irq_enabled) {
474         if (msix_enabled(&(n->parent_obj))) {
475             return;
476         } else {
477             assert(cq->vector < 32);
478             if (!n->cq_pending) {
479                 n->irq_status &= ~(1 << cq->vector);
480             }
481             nvme_irq_check(n);
482         }
483     }
484 }
485 
486 static void nvme_req_clear(NvmeRequest *req)
487 {
488     req->ns = NULL;
489     req->opaque = NULL;
490     req->aiocb = NULL;
491     memset(&req->cqe, 0x0, sizeof(req->cqe));
492     req->status = NVME_SUCCESS;
493 }
494 
495 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
496 {
497     if (dma) {
498         pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
499         sg->flags = NVME_SG_DMA;
500     } else {
501         qemu_iovec_init(&sg->iov, 0);
502     }
503 
504     sg->flags |= NVME_SG_ALLOC;
505 }
506 
507 static inline void nvme_sg_unmap(NvmeSg *sg)
508 {
509     if (!(sg->flags & NVME_SG_ALLOC)) {
510         return;
511     }
512 
513     if (sg->flags & NVME_SG_DMA) {
514         qemu_sglist_destroy(&sg->qsg);
515     } else {
516         qemu_iovec_destroy(&sg->iov);
517     }
518 
519     memset(sg, 0x0, sizeof(*sg));
520 }
521 
522 /*
523  * When metadata is transfered as extended LBAs, the DPTR mapped into `sg`
524  * holds both data and metadata. This function splits the data and metadata
525  * into two separate QSG/IOVs.
526  */
527 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
528                           NvmeSg *mdata)
529 {
530     NvmeSg *dst = data;
531     uint32_t trans_len, count = ns->lbasz;
532     uint64_t offset = 0;
533     bool dma = sg->flags & NVME_SG_DMA;
534     size_t sge_len;
535     size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
536     int sg_idx = 0;
537 
538     assert(sg->flags & NVME_SG_ALLOC);
539 
540     while (sg_len) {
541         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
542 
543         trans_len = MIN(sg_len, count);
544         trans_len = MIN(trans_len, sge_len - offset);
545 
546         if (dst) {
547             if (dma) {
548                 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
549                                 trans_len);
550             } else {
551                 qemu_iovec_add(&dst->iov,
552                                sg->iov.iov[sg_idx].iov_base + offset,
553                                trans_len);
554             }
555         }
556 
557         sg_len -= trans_len;
558         count -= trans_len;
559         offset += trans_len;
560 
561         if (count == 0) {
562             dst = (dst == data) ? mdata : data;
563             count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
564         }
565 
566         if (sge_len == offset) {
567             offset = 0;
568             sg_idx++;
569         }
570     }
571 }
572 
573 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
574                                   size_t len)
575 {
576     if (!len) {
577         return NVME_SUCCESS;
578     }
579 
580     trace_pci_nvme_map_addr_cmb(addr, len);
581 
582     if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
583         return NVME_DATA_TRAS_ERROR;
584     }
585 
586     qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
587 
588     return NVME_SUCCESS;
589 }
590 
591 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
592                                   size_t len)
593 {
594     if (!len) {
595         return NVME_SUCCESS;
596     }
597 
598     if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
599         return NVME_DATA_TRAS_ERROR;
600     }
601 
602     qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
603 
604     return NVME_SUCCESS;
605 }
606 
607 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
608 {
609     bool cmb = false, pmr = false;
610 
611     if (!len) {
612         return NVME_SUCCESS;
613     }
614 
615     trace_pci_nvme_map_addr(addr, len);
616 
617     if (nvme_addr_is_cmb(n, addr)) {
618         cmb = true;
619     } else if (nvme_addr_is_pmr(n, addr)) {
620         pmr = true;
621     }
622 
623     if (cmb || pmr) {
624         if (sg->flags & NVME_SG_DMA) {
625             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
626         }
627 
628         if (sg->iov.niov + 1 > IOV_MAX) {
629             goto max_mappings_exceeded;
630         }
631 
632         if (cmb) {
633             return nvme_map_addr_cmb(n, &sg->iov, addr, len);
634         } else {
635             return nvme_map_addr_pmr(n, &sg->iov, addr, len);
636         }
637     }
638 
639     if (!(sg->flags & NVME_SG_DMA)) {
640         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
641     }
642 
643     if (sg->qsg.nsg + 1 > IOV_MAX) {
644         goto max_mappings_exceeded;
645     }
646 
647     qemu_sglist_add(&sg->qsg, addr, len);
648 
649     return NVME_SUCCESS;
650 
651 max_mappings_exceeded:
652     NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
653                    "number of mappings exceed 1024");
654     return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
655 }
656 
657 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
658 {
659     return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
660 }
661 
662 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
663                              uint64_t prp2, uint32_t len)
664 {
665     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
666     trans_len = MIN(len, trans_len);
667     int num_prps = (len >> n->page_bits) + 1;
668     uint16_t status;
669     int ret;
670 
671     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
672 
673     nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
674 
675     status = nvme_map_addr(n, sg, prp1, trans_len);
676     if (status) {
677         goto unmap;
678     }
679 
680     len -= trans_len;
681     if (len) {
682         if (len > n->page_size) {
683             uint64_t prp_list[n->max_prp_ents];
684             uint32_t nents, prp_trans;
685             int i = 0;
686 
687             /*
688              * The first PRP list entry, pointed to by PRP2 may contain offset.
689              * Hence, we need to calculate the number of entries in based on
690              * that offset.
691              */
692             nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
693             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
694             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
695             if (ret) {
696                 trace_pci_nvme_err_addr_read(prp2);
697                 status = NVME_DATA_TRAS_ERROR;
698                 goto unmap;
699             }
700             while (len != 0) {
701                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
702 
703                 if (i == nents - 1 && len > n->page_size) {
704                     if (unlikely(prp_ent & (n->page_size - 1))) {
705                         trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
706                         status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
707                         goto unmap;
708                     }
709 
710                     i = 0;
711                     nents = (len + n->page_size - 1) >> n->page_bits;
712                     nents = MIN(nents, n->max_prp_ents);
713                     prp_trans = nents * sizeof(uint64_t);
714                     ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
715                                          prp_trans);
716                     if (ret) {
717                         trace_pci_nvme_err_addr_read(prp_ent);
718                         status = NVME_DATA_TRAS_ERROR;
719                         goto unmap;
720                     }
721                     prp_ent = le64_to_cpu(prp_list[i]);
722                 }
723 
724                 if (unlikely(prp_ent & (n->page_size - 1))) {
725                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
726                     status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
727                     goto unmap;
728                 }
729 
730                 trans_len = MIN(len, n->page_size);
731                 status = nvme_map_addr(n, sg, prp_ent, trans_len);
732                 if (status) {
733                     goto unmap;
734                 }
735 
736                 len -= trans_len;
737                 i++;
738             }
739         } else {
740             if (unlikely(prp2 & (n->page_size - 1))) {
741                 trace_pci_nvme_err_invalid_prp2_align(prp2);
742                 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
743                 goto unmap;
744             }
745             status = nvme_map_addr(n, sg, prp2, len);
746             if (status) {
747                 goto unmap;
748             }
749         }
750     }
751 
752     return NVME_SUCCESS;
753 
754 unmap:
755     nvme_sg_unmap(sg);
756     return status;
757 }
758 
759 /*
760  * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
761  * number of bytes mapped in len.
762  */
763 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
764                                   NvmeSglDescriptor *segment, uint64_t nsgld,
765                                   size_t *len, NvmeCmd *cmd)
766 {
767     dma_addr_t addr, trans_len;
768     uint32_t dlen;
769     uint16_t status;
770 
771     for (int i = 0; i < nsgld; i++) {
772         uint8_t type = NVME_SGL_TYPE(segment[i].type);
773 
774         switch (type) {
775         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
776             if (cmd->opcode == NVME_CMD_WRITE) {
777                 continue;
778             }
779         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
780             break;
781         case NVME_SGL_DESCR_TYPE_SEGMENT:
782         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
783             return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
784         default:
785             return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
786         }
787 
788         dlen = le32_to_cpu(segment[i].len);
789 
790         if (!dlen) {
791             continue;
792         }
793 
794         if (*len == 0) {
795             /*
796              * All data has been mapped, but the SGL contains additional
797              * segments and/or descriptors. The controller might accept
798              * ignoring the rest of the SGL.
799              */
800             uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
801             if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
802                 break;
803             }
804 
805             trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
806             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
807         }
808 
809         trans_len = MIN(*len, dlen);
810 
811         if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
812             goto next;
813         }
814 
815         addr = le64_to_cpu(segment[i].addr);
816 
817         if (UINT64_MAX - addr < dlen) {
818             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
819         }
820 
821         status = nvme_map_addr(n, sg, addr, trans_len);
822         if (status) {
823             return status;
824         }
825 
826 next:
827         *len -= trans_len;
828     }
829 
830     return NVME_SUCCESS;
831 }
832 
833 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
834                              size_t len, NvmeCmd *cmd)
835 {
836     /*
837      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
838      * dynamically allocating a potentially huge SGL. The spec allows the SGL
839      * to be larger (as in number of bytes required to describe the SGL
840      * descriptors and segment chain) than the command transfer size, so it is
841      * not bounded by MDTS.
842      */
843     const int SEG_CHUNK_SIZE = 256;
844 
845     NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
846     uint64_t nsgld;
847     uint32_t seg_len;
848     uint16_t status;
849     hwaddr addr;
850     int ret;
851 
852     sgld = &sgl;
853     addr = le64_to_cpu(sgl.addr);
854 
855     trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
856 
857     nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
858 
859     /*
860      * If the entire transfer can be described with a single data block it can
861      * be mapped directly.
862      */
863     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
864         status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
865         if (status) {
866             goto unmap;
867         }
868 
869         goto out;
870     }
871 
872     for (;;) {
873         switch (NVME_SGL_TYPE(sgld->type)) {
874         case NVME_SGL_DESCR_TYPE_SEGMENT:
875         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
876             break;
877         default:
878             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
879         }
880 
881         seg_len = le32_to_cpu(sgld->len);
882 
883         /* check the length of the (Last) Segment descriptor */
884         if ((!seg_len || seg_len & 0xf) &&
885             (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
886             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
887         }
888 
889         if (UINT64_MAX - addr < seg_len) {
890             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
891         }
892 
893         nsgld = seg_len / sizeof(NvmeSglDescriptor);
894 
895         while (nsgld > SEG_CHUNK_SIZE) {
896             if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
897                 trace_pci_nvme_err_addr_read(addr);
898                 status = NVME_DATA_TRAS_ERROR;
899                 goto unmap;
900             }
901 
902             status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
903                                        &len, cmd);
904             if (status) {
905                 goto unmap;
906             }
907 
908             nsgld -= SEG_CHUNK_SIZE;
909             addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
910         }
911 
912         ret = nvme_addr_read(n, addr, segment, nsgld *
913                              sizeof(NvmeSglDescriptor));
914         if (ret) {
915             trace_pci_nvme_err_addr_read(addr);
916             status = NVME_DATA_TRAS_ERROR;
917             goto unmap;
918         }
919 
920         last_sgld = &segment[nsgld - 1];
921 
922         /*
923          * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
924          * then we are done.
925          */
926         switch (NVME_SGL_TYPE(last_sgld->type)) {
927         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
928         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
929             status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
930             if (status) {
931                 goto unmap;
932             }
933 
934             goto out;
935 
936         default:
937             break;
938         }
939 
940         /*
941          * If the last descriptor was not a Data Block or Bit Bucket, then the
942          * current segment must not be a Last Segment.
943          */
944         if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
945             status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
946             goto unmap;
947         }
948 
949         sgld = last_sgld;
950         addr = le64_to_cpu(sgld->addr);
951 
952         /*
953          * Do not map the last descriptor; it will be a Segment or Last Segment
954          * descriptor and is handled by the next iteration.
955          */
956         status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
957         if (status) {
958             goto unmap;
959         }
960     }
961 
962 out:
963     /* if there is any residual left in len, the SGL was too short */
964     if (len) {
965         status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
966         goto unmap;
967     }
968 
969     return NVME_SUCCESS;
970 
971 unmap:
972     nvme_sg_unmap(sg);
973     return status;
974 }
975 
976 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
977                        NvmeCmd *cmd)
978 {
979     uint64_t prp1, prp2;
980 
981     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
982     case NVME_PSDT_PRP:
983         prp1 = le64_to_cpu(cmd->dptr.prp1);
984         prp2 = le64_to_cpu(cmd->dptr.prp2);
985 
986         return nvme_map_prp(n, sg, prp1, prp2, len);
987     case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
988     case NVME_PSDT_SGL_MPTR_SGL:
989         return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
990     default:
991         return NVME_INVALID_FIELD;
992     }
993 }
994 
995 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
996                               NvmeCmd *cmd)
997 {
998     int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
999     hwaddr mptr = le64_to_cpu(cmd->mptr);
1000     uint16_t status;
1001 
1002     if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1003         NvmeSglDescriptor sgl;
1004 
1005         if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1006             return NVME_DATA_TRAS_ERROR;
1007         }
1008 
1009         status = nvme_map_sgl(n, sg, sgl, len, cmd);
1010         if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1011             status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1012         }
1013 
1014         return status;
1015     }
1016 
1017     nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1018     status = nvme_map_addr(n, sg, mptr, len);
1019     if (status) {
1020         nvme_sg_unmap(sg);
1021     }
1022 
1023     return status;
1024 }
1025 
1026 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1027 {
1028     NvmeNamespace *ns = req->ns;
1029     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1030     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1031     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1032     size_t len = nvme_l2b(ns, nlb);
1033     uint16_t status;
1034 
1035     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1036         NvmeSg sg;
1037 
1038         len += nvme_m2b(ns, nlb);
1039 
1040         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1041         if (status) {
1042             return status;
1043         }
1044 
1045         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1046         nvme_sg_split(&sg, ns, &req->sg, NULL);
1047         nvme_sg_unmap(&sg);
1048 
1049         return NVME_SUCCESS;
1050     }
1051 
1052     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1053 }
1054 
1055 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1056 {
1057     NvmeNamespace *ns = req->ns;
1058     size_t len = nvme_m2b(ns, nlb);
1059     uint16_t status;
1060 
1061     if (nvme_ns_ext(ns)) {
1062         NvmeSg sg;
1063 
1064         len += nvme_l2b(ns, nlb);
1065 
1066         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1067         if (status) {
1068             return status;
1069         }
1070 
1071         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1072         nvme_sg_split(&sg, ns, NULL, &req->sg);
1073         nvme_sg_unmap(&sg);
1074 
1075         return NVME_SUCCESS;
1076     }
1077 
1078     return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1079 }
1080 
1081 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1082                                     uint32_t len, uint32_t bytes,
1083                                     int32_t skip_bytes, int64_t offset,
1084                                     NvmeTxDirection dir)
1085 {
1086     hwaddr addr;
1087     uint32_t trans_len, count = bytes;
1088     bool dma = sg->flags & NVME_SG_DMA;
1089     int64_t sge_len;
1090     int sg_idx = 0;
1091     int ret;
1092 
1093     assert(sg->flags & NVME_SG_ALLOC);
1094 
1095     while (len) {
1096         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1097 
1098         if (sge_len - offset < 0) {
1099             offset -= sge_len;
1100             sg_idx++;
1101             continue;
1102         }
1103 
1104         if (sge_len == offset) {
1105             offset = 0;
1106             sg_idx++;
1107             continue;
1108         }
1109 
1110         trans_len = MIN(len, count);
1111         trans_len = MIN(trans_len, sge_len - offset);
1112 
1113         if (dma) {
1114             addr = sg->qsg.sg[sg_idx].base + offset;
1115         } else {
1116             addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1117         }
1118 
1119         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1120             ret = nvme_addr_read(n, addr, ptr, trans_len);
1121         } else {
1122             ret = nvme_addr_write(n, addr, ptr, trans_len);
1123         }
1124 
1125         if (ret) {
1126             return NVME_DATA_TRAS_ERROR;
1127         }
1128 
1129         ptr += trans_len;
1130         len -= trans_len;
1131         count -= trans_len;
1132         offset += trans_len;
1133 
1134         if (count == 0) {
1135             count = bytes;
1136             offset += skip_bytes;
1137         }
1138     }
1139 
1140     return NVME_SUCCESS;
1141 }
1142 
1143 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1144                         NvmeTxDirection dir)
1145 {
1146     assert(sg->flags & NVME_SG_ALLOC);
1147 
1148     if (sg->flags & NVME_SG_DMA) {
1149         uint64_t residual;
1150 
1151         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1152             residual = dma_buf_write(ptr, len, &sg->qsg);
1153         } else {
1154             residual = dma_buf_read(ptr, len, &sg->qsg);
1155         }
1156 
1157         if (unlikely(residual)) {
1158             trace_pci_nvme_err_invalid_dma();
1159             return NVME_INVALID_FIELD | NVME_DNR;
1160         }
1161     } else {
1162         size_t bytes;
1163 
1164         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1165             bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1166         } else {
1167             bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1168         }
1169 
1170         if (unlikely(bytes != len)) {
1171             trace_pci_nvme_err_invalid_dma();
1172             return NVME_INVALID_FIELD | NVME_DNR;
1173         }
1174     }
1175 
1176     return NVME_SUCCESS;
1177 }
1178 
1179 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1180                                 NvmeRequest *req)
1181 {
1182     uint16_t status;
1183 
1184     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1185     if (status) {
1186         return status;
1187     }
1188 
1189     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1190 }
1191 
1192 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1193                                 NvmeRequest *req)
1194 {
1195     uint16_t status;
1196 
1197     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1198     if (status) {
1199         return status;
1200     }
1201 
1202     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1203 }
1204 
1205 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1206                           NvmeTxDirection dir, NvmeRequest *req)
1207 {
1208     NvmeNamespace *ns = req->ns;
1209     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1210     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1211     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1212 
1213     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1214         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1215                                    ns->lbaf.ms, 0, dir);
1216     }
1217 
1218     return nvme_tx(n, &req->sg, ptr, len, dir);
1219 }
1220 
1221 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1222                            NvmeTxDirection dir, NvmeRequest *req)
1223 {
1224     NvmeNamespace *ns = req->ns;
1225     uint16_t status;
1226 
1227     if (nvme_ns_ext(ns)) {
1228         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1229                                    ns->lbasz, ns->lbasz, dir);
1230     }
1231 
1232     nvme_sg_unmap(&req->sg);
1233 
1234     status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1235     if (status) {
1236         return status;
1237     }
1238 
1239     return nvme_tx(n, &req->sg, ptr, len, dir);
1240 }
1241 
1242 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1243                                  BlockCompletionFunc *cb, NvmeRequest *req)
1244 {
1245     assert(req->sg.flags & NVME_SG_ALLOC);
1246 
1247     if (req->sg.flags & NVME_SG_DMA) {
1248         req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1249                                   cb, req);
1250     } else {
1251         req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1252     }
1253 }
1254 
1255 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1256                                   BlockCompletionFunc *cb, NvmeRequest *req)
1257 {
1258     assert(req->sg.flags & NVME_SG_ALLOC);
1259 
1260     if (req->sg.flags & NVME_SG_DMA) {
1261         req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1262                                    cb, req);
1263     } else {
1264         req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1265     }
1266 }
1267 
1268 static void nvme_post_cqes(void *opaque)
1269 {
1270     NvmeCQueue *cq = opaque;
1271     NvmeCtrl *n = cq->ctrl;
1272     NvmeRequest *req, *next;
1273     bool pending = cq->head != cq->tail;
1274     int ret;
1275 
1276     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1277         NvmeSQueue *sq;
1278         hwaddr addr;
1279 
1280         if (nvme_cq_full(cq)) {
1281             break;
1282         }
1283 
1284         sq = req->sq;
1285         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1286         req->cqe.sq_id = cpu_to_le16(sq->sqid);
1287         req->cqe.sq_head = cpu_to_le16(sq->head);
1288         addr = cq->dma_addr + cq->tail * n->cqe_size;
1289         ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1290                             sizeof(req->cqe));
1291         if (ret) {
1292             trace_pci_nvme_err_addr_write(addr);
1293             trace_pci_nvme_err_cfs();
1294             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1295             break;
1296         }
1297         QTAILQ_REMOVE(&cq->req_list, req, entry);
1298         nvme_inc_cq_tail(cq);
1299         nvme_sg_unmap(&req->sg);
1300         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1301     }
1302     if (cq->tail != cq->head) {
1303         if (cq->irq_enabled && !pending) {
1304             n->cq_pending++;
1305         }
1306 
1307         nvme_irq_assert(n, cq);
1308     }
1309 }
1310 
1311 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1312 {
1313     assert(cq->cqid == req->sq->cqid);
1314     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1315                                           le32_to_cpu(req->cqe.result),
1316                                           le32_to_cpu(req->cqe.dw1),
1317                                           req->status);
1318 
1319     if (req->status) {
1320         trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1321                                       req->status, req->cmd.opcode);
1322     }
1323 
1324     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1325     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1326     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1327 }
1328 
1329 static void nvme_process_aers(void *opaque)
1330 {
1331     NvmeCtrl *n = opaque;
1332     NvmeAsyncEvent *event, *next;
1333 
1334     trace_pci_nvme_process_aers(n->aer_queued);
1335 
1336     QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1337         NvmeRequest *req;
1338         NvmeAerResult *result;
1339 
1340         /* can't post cqe if there is nothing to complete */
1341         if (!n->outstanding_aers) {
1342             trace_pci_nvme_no_outstanding_aers();
1343             break;
1344         }
1345 
1346         /* ignore if masked (cqe posted, but event not cleared) */
1347         if (n->aer_mask & (1 << event->result.event_type)) {
1348             trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1349             continue;
1350         }
1351 
1352         QTAILQ_REMOVE(&n->aer_queue, event, entry);
1353         n->aer_queued--;
1354 
1355         n->aer_mask |= 1 << event->result.event_type;
1356         n->outstanding_aers--;
1357 
1358         req = n->aer_reqs[n->outstanding_aers];
1359 
1360         result = (NvmeAerResult *) &req->cqe.result;
1361         result->event_type = event->result.event_type;
1362         result->event_info = event->result.event_info;
1363         result->log_page = event->result.log_page;
1364         g_free(event);
1365 
1366         trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1367                                     result->log_page);
1368 
1369         nvme_enqueue_req_completion(&n->admin_cq, req);
1370     }
1371 }
1372 
1373 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1374                                uint8_t event_info, uint8_t log_page)
1375 {
1376     NvmeAsyncEvent *event;
1377 
1378     trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1379 
1380     if (n->aer_queued == n->params.aer_max_queued) {
1381         trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1382         return;
1383     }
1384 
1385     event = g_new(NvmeAsyncEvent, 1);
1386     event->result = (NvmeAerResult) {
1387         .event_type = event_type,
1388         .event_info = event_info,
1389         .log_page   = log_page,
1390     };
1391 
1392     QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1393     n->aer_queued++;
1394 
1395     nvme_process_aers(n);
1396 }
1397 
1398 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1399 {
1400     uint8_t aer_info;
1401 
1402     /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
1403     if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1404         return;
1405     }
1406 
1407     switch (event) {
1408     case NVME_SMART_SPARE:
1409         aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1410         break;
1411     case NVME_SMART_TEMPERATURE:
1412         aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1413         break;
1414     case NVME_SMART_RELIABILITY:
1415     case NVME_SMART_MEDIA_READ_ONLY:
1416     case NVME_SMART_FAILED_VOLATILE_MEDIA:
1417     case NVME_SMART_PMR_UNRELIABLE:
1418         aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1419         break;
1420     default:
1421         return;
1422     }
1423 
1424     nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1425 }
1426 
1427 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1428 {
1429     n->aer_mask &= ~(1 << event_type);
1430     if (!QTAILQ_EMPTY(&n->aer_queue)) {
1431         nvme_process_aers(n);
1432     }
1433 }
1434 
1435 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1436 {
1437     uint8_t mdts = n->params.mdts;
1438 
1439     if (mdts && len > n->page_size << mdts) {
1440         trace_pci_nvme_err_mdts(len);
1441         return NVME_INVALID_FIELD | NVME_DNR;
1442     }
1443 
1444     return NVME_SUCCESS;
1445 }
1446 
1447 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1448                                          uint32_t nlb)
1449 {
1450     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1451 
1452     if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1453         trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1454         return NVME_LBA_RANGE | NVME_DNR;
1455     }
1456 
1457     return NVME_SUCCESS;
1458 }
1459 
1460 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1461                                  uint32_t nlb, int flags)
1462 {
1463     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1464 
1465     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1466     int64_t offset = nvme_l2b(ns, slba);
1467     int ret;
1468 
1469     /*
1470      * `pnum` holds the number of bytes after offset that shares the same
1471      * allocation status as the byte at offset. If `pnum` is different from
1472      * `bytes`, we should check the allocation status of the next range and
1473      * continue this until all bytes have been checked.
1474      */
1475     do {
1476         bytes -= pnum;
1477 
1478         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1479         if (ret < 0) {
1480             return ret;
1481         }
1482 
1483 
1484         trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1485                                     !!(ret & BDRV_BLOCK_ZERO));
1486 
1487         if (!(ret & flags)) {
1488             return 1;
1489         }
1490 
1491         offset += pnum;
1492     } while (pnum != bytes);
1493 
1494     return 0;
1495 }
1496 
1497 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1498                                  uint32_t nlb)
1499 {
1500     int ret;
1501     Error *err = NULL;
1502 
1503     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1504     if (ret) {
1505         if (ret < 0) {
1506             error_setg_errno(&err, -ret, "unable to get block status");
1507             error_report_err(err);
1508 
1509             return NVME_INTERNAL_DEV_ERROR;
1510         }
1511 
1512         return NVME_DULB;
1513     }
1514 
1515     return NVME_SUCCESS;
1516 }
1517 
1518 static void nvme_aio_err(NvmeRequest *req, int ret)
1519 {
1520     uint16_t status = NVME_SUCCESS;
1521     Error *local_err = NULL;
1522 
1523     switch (req->cmd.opcode) {
1524     case NVME_CMD_READ:
1525         status = NVME_UNRECOVERED_READ;
1526         break;
1527     case NVME_CMD_FLUSH:
1528     case NVME_CMD_WRITE:
1529     case NVME_CMD_WRITE_ZEROES:
1530     case NVME_CMD_ZONE_APPEND:
1531         status = NVME_WRITE_FAULT;
1532         break;
1533     default:
1534         status = NVME_INTERNAL_DEV_ERROR;
1535         break;
1536     }
1537 
1538     trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1539 
1540     error_setg_errno(&local_err, -ret, "aio failed");
1541     error_report_err(local_err);
1542 
1543     /*
1544      * Set the command status code to the first encountered error but allow a
1545      * subsequent Internal Device Error to trump it.
1546      */
1547     if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1548         return;
1549     }
1550 
1551     req->status = status;
1552 }
1553 
1554 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1555 {
1556     return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1557                                     slba / ns->zone_size;
1558 }
1559 
1560 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1561 {
1562     uint32_t zone_idx = nvme_zone_idx(ns, slba);
1563 
1564     if (zone_idx >= ns->num_zones) {
1565         return NULL;
1566     }
1567 
1568     return &ns->zone_array[zone_idx];
1569 }
1570 
1571 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1572 {
1573     uint64_t zslba = zone->d.zslba;
1574 
1575     switch (nvme_get_zone_state(zone)) {
1576     case NVME_ZONE_STATE_EMPTY:
1577     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1578     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1579     case NVME_ZONE_STATE_CLOSED:
1580         return NVME_SUCCESS;
1581     case NVME_ZONE_STATE_FULL:
1582         trace_pci_nvme_err_zone_is_full(zslba);
1583         return NVME_ZONE_FULL;
1584     case NVME_ZONE_STATE_OFFLINE:
1585         trace_pci_nvme_err_zone_is_offline(zslba);
1586         return NVME_ZONE_OFFLINE;
1587     case NVME_ZONE_STATE_READ_ONLY:
1588         trace_pci_nvme_err_zone_is_read_only(zslba);
1589         return NVME_ZONE_READ_ONLY;
1590     default:
1591         assert(false);
1592     }
1593 
1594     return NVME_INTERNAL_DEV_ERROR;
1595 }
1596 
1597 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1598                                       uint64_t slba, uint32_t nlb)
1599 {
1600     uint64_t zcap = nvme_zone_wr_boundary(zone);
1601     uint16_t status;
1602 
1603     status = nvme_check_zone_state_for_write(zone);
1604     if (status) {
1605         return status;
1606     }
1607 
1608     if (unlikely(slba != zone->w_ptr)) {
1609         trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1610         return NVME_ZONE_INVALID_WRITE;
1611     }
1612 
1613     if (unlikely((slba + nlb) > zcap)) {
1614         trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1615         return NVME_ZONE_BOUNDARY_ERROR;
1616     }
1617 
1618     return NVME_SUCCESS;
1619 }
1620 
1621 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1622 {
1623     switch (nvme_get_zone_state(zone)) {
1624     case NVME_ZONE_STATE_EMPTY:
1625     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1626     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1627     case NVME_ZONE_STATE_FULL:
1628     case NVME_ZONE_STATE_CLOSED:
1629     case NVME_ZONE_STATE_READ_ONLY:
1630         return NVME_SUCCESS;
1631     case NVME_ZONE_STATE_OFFLINE:
1632         trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1633         return NVME_ZONE_OFFLINE;
1634     default:
1635         assert(false);
1636     }
1637 
1638     return NVME_INTERNAL_DEV_ERROR;
1639 }
1640 
1641 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1642                                      uint32_t nlb)
1643 {
1644     NvmeZone *zone;
1645     uint64_t bndry, end;
1646     uint16_t status;
1647 
1648     zone = nvme_get_zone_by_slba(ns, slba);
1649     assert(zone);
1650 
1651     bndry = nvme_zone_rd_boundary(ns, zone);
1652     end = slba + nlb;
1653 
1654     status = nvme_check_zone_state_for_read(zone);
1655     if (status) {
1656         ;
1657     } else if (unlikely(end > bndry)) {
1658         if (!ns->params.cross_zone_read) {
1659             status = NVME_ZONE_BOUNDARY_ERROR;
1660         } else {
1661             /*
1662              * Read across zone boundary - check that all subsequent
1663              * zones that are being read have an appropriate state.
1664              */
1665             do {
1666                 zone++;
1667                 status = nvme_check_zone_state_for_read(zone);
1668                 if (status) {
1669                     break;
1670                 }
1671             } while (end > nvme_zone_rd_boundary(ns, zone));
1672         }
1673     }
1674 
1675     return status;
1676 }
1677 
1678 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1679 {
1680     switch (nvme_get_zone_state(zone)) {
1681     case NVME_ZONE_STATE_FULL:
1682         return NVME_SUCCESS;
1683 
1684     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1685     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1686         nvme_aor_dec_open(ns);
1687         /* fallthrough */
1688     case NVME_ZONE_STATE_CLOSED:
1689         nvme_aor_dec_active(ns);
1690         /* fallthrough */
1691     case NVME_ZONE_STATE_EMPTY:
1692         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1693         return NVME_SUCCESS;
1694 
1695     default:
1696         return NVME_ZONE_INVAL_TRANSITION;
1697     }
1698 }
1699 
1700 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1701 {
1702     switch (nvme_get_zone_state(zone)) {
1703     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1704     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1705         nvme_aor_dec_open(ns);
1706         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1707         /* fall through */
1708     case NVME_ZONE_STATE_CLOSED:
1709         return NVME_SUCCESS;
1710 
1711     default:
1712         return NVME_ZONE_INVAL_TRANSITION;
1713     }
1714 }
1715 
1716 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1717 {
1718     switch (nvme_get_zone_state(zone)) {
1719     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1720     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1721         nvme_aor_dec_open(ns);
1722         /* fallthrough */
1723     case NVME_ZONE_STATE_CLOSED:
1724         nvme_aor_dec_active(ns);
1725         /* fallthrough */
1726     case NVME_ZONE_STATE_FULL:
1727         zone->w_ptr = zone->d.zslba;
1728         zone->d.wp = zone->w_ptr;
1729         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1730         /* fallthrough */
1731     case NVME_ZONE_STATE_EMPTY:
1732         return NVME_SUCCESS;
1733 
1734     default:
1735         return NVME_ZONE_INVAL_TRANSITION;
1736     }
1737 }
1738 
1739 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1740 {
1741     NvmeZone *zone;
1742 
1743     if (ns->params.max_open_zones &&
1744         ns->nr_open_zones == ns->params.max_open_zones) {
1745         zone = QTAILQ_FIRST(&ns->imp_open_zones);
1746         if (zone) {
1747             /*
1748              * Automatically close this implicitly open zone.
1749              */
1750             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1751             nvme_zrm_close(ns, zone);
1752         }
1753     }
1754 }
1755 
/* Flags accepted by nvme_zrm_open_flags(). */
enum {
    /* open the zone implicitly rather than explicitly */
    NVME_ZRM_AUTO = (1 << 0),
};
1759 
1760 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1761                                     NvmeZone *zone, int flags)
1762 {
1763     int act = 0;
1764     uint16_t status;
1765 
1766     switch (nvme_get_zone_state(zone)) {
1767     case NVME_ZONE_STATE_EMPTY:
1768         act = 1;
1769 
1770         /* fallthrough */
1771 
1772     case NVME_ZONE_STATE_CLOSED:
1773         if (n->params.auto_transition_zones) {
1774             nvme_zrm_auto_transition_zone(ns);
1775         }
1776         status = nvme_aor_check(ns, act, 1);
1777         if (status) {
1778             return status;
1779         }
1780 
1781         if (act) {
1782             nvme_aor_inc_active(ns);
1783         }
1784 
1785         nvme_aor_inc_open(ns);
1786 
1787         if (flags & NVME_ZRM_AUTO) {
1788             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1789             return NVME_SUCCESS;
1790         }
1791 
1792         /* fallthrough */
1793 
1794     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1795         if (flags & NVME_ZRM_AUTO) {
1796             return NVME_SUCCESS;
1797         }
1798 
1799         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1800 
1801         /* fallthrough */
1802 
1803     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1804         return NVME_SUCCESS;
1805 
1806     default:
1807         return NVME_ZONE_INVAL_TRANSITION;
1808     }
1809 }
1810 
1811 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1812                                      NvmeZone *zone)
1813 {
1814     return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1815 }
1816 
1817 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
1818                                      NvmeZone *zone)
1819 {
1820     return nvme_zrm_open_flags(n, ns, zone, 0);
1821 }
1822 
1823 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1824                                  uint32_t nlb)
1825 {
1826     zone->d.wp += nlb;
1827 
1828     if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1829         nvme_zrm_finish(ns, zone);
1830     }
1831 }
1832 
1833 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1834 {
1835     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1836     NvmeZone *zone;
1837     uint64_t slba;
1838     uint32_t nlb;
1839 
1840     slba = le64_to_cpu(rw->slba);
1841     nlb = le16_to_cpu(rw->nlb) + 1;
1842     zone = nvme_get_zone_by_slba(ns, slba);
1843     assert(zone);
1844 
1845     nvme_advance_zone_wp(ns, zone, nlb);
1846 }
1847 
1848 static inline bool nvme_is_write(NvmeRequest *req)
1849 {
1850     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1851 
1852     return rw->opcode == NVME_CMD_WRITE ||
1853            rw->opcode == NVME_CMD_ZONE_APPEND ||
1854            rw->opcode == NVME_CMD_WRITE_ZEROES;
1855 }
1856 
1857 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1858 {
1859     return qemu_get_aio_context();
1860 }
1861 
1862 static void nvme_misc_cb(void *opaque, int ret)
1863 {
1864     NvmeRequest *req = opaque;
1865 
1866     trace_pci_nvme_misc_cb(nvme_cid(req));
1867 
1868     if (ret) {
1869         nvme_aio_err(req, ret);
1870     }
1871 
1872     nvme_enqueue_req_completion(nvme_cq(req), req);
1873 }
1874 
1875 void nvme_rw_complete_cb(void *opaque, int ret)
1876 {
1877     NvmeRequest *req = opaque;
1878     NvmeNamespace *ns = req->ns;
1879     BlockBackend *blk = ns->blkconf.blk;
1880     BlockAcctCookie *acct = &req->acct;
1881     BlockAcctStats *stats = blk_get_stats(blk);
1882 
1883     trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1884 
1885     if (ret) {
1886         block_acct_failed(stats, acct);
1887         nvme_aio_err(req, ret);
1888     } else {
1889         block_acct_done(stats, acct);
1890     }
1891 
1892     if (ns->params.zoned && nvme_is_write(req)) {
1893         nvme_finalize_zoned_write(ns, req);
1894     }
1895 
1896     nvme_enqueue_req_completion(nvme_cq(req), req);
1897 }
1898 
1899 static void nvme_rw_cb(void *opaque, int ret)
1900 {
1901     NvmeRequest *req = opaque;
1902     NvmeNamespace *ns = req->ns;
1903 
1904     BlockBackend *blk = ns->blkconf.blk;
1905 
1906     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1907 
1908     if (ret) {
1909         goto out;
1910     }
1911 
1912     if (ns->lbaf.ms) {
1913         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1914         uint64_t slba = le64_to_cpu(rw->slba);
1915         uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1916         uint64_t offset = nvme_moff(ns, slba);
1917 
1918         if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1919             size_t mlen = nvme_m2b(ns, nlb);
1920 
1921             req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1922                                                BDRV_REQ_MAY_UNMAP,
1923                                                nvme_rw_complete_cb, req);
1924             return;
1925         }
1926 
1927         if (nvme_ns_ext(ns) || req->cmd.mptr) {
1928             uint16_t status;
1929 
1930             nvme_sg_unmap(&req->sg);
1931             status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1932             if (status) {
1933                 ret = -EFAULT;
1934                 goto out;
1935             }
1936 
1937             if (req->cmd.opcode == NVME_CMD_READ) {
1938                 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1939             }
1940 
1941             return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1942         }
1943     }
1944 
1945 out:
1946     nvme_rw_complete_cb(req, ret);
1947 }
1948 
1949 static void nvme_verify_cb(void *opaque, int ret)
1950 {
1951     NvmeBounceContext *ctx = opaque;
1952     NvmeRequest *req = ctx->req;
1953     NvmeNamespace *ns = req->ns;
1954     BlockBackend *blk = ns->blkconf.blk;
1955     BlockAcctCookie *acct = &req->acct;
1956     BlockAcctStats *stats = blk_get_stats(blk);
1957     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1958     uint64_t slba = le64_to_cpu(rw->slba);
1959     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
1960     uint16_t apptag = le16_to_cpu(rw->apptag);
1961     uint16_t appmask = le16_to_cpu(rw->appmask);
1962     uint32_t reftag = le32_to_cpu(rw->reftag);
1963     uint16_t status;
1964 
1965     trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
1966 
1967     if (ret) {
1968         block_acct_failed(stats, acct);
1969         nvme_aio_err(req, ret);
1970         goto out;
1971     }
1972 
1973     block_acct_done(stats, acct);
1974 
1975     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1976         status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1977                                        ctx->mdata.iov.size, slba);
1978         if (status) {
1979             req->status = status;
1980             goto out;
1981         }
1982 
1983         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1984                                      ctx->mdata.bounce, ctx->mdata.iov.size,
1985                                      prinfo, slba, apptag, appmask, &reftag);
1986     }
1987 
1988 out:
1989     qemu_iovec_destroy(&ctx->data.iov);
1990     g_free(ctx->data.bounce);
1991 
1992     qemu_iovec_destroy(&ctx->mdata.iov);
1993     g_free(ctx->mdata.bounce);
1994 
1995     g_free(ctx);
1996 
1997     nvme_enqueue_req_completion(nvme_cq(req), req);
1998 }
1999 
2000 
2001 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2002 {
2003     NvmeBounceContext *ctx = opaque;
2004     NvmeRequest *req = ctx->req;
2005     NvmeNamespace *ns = req->ns;
2006     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2007     uint64_t slba = le64_to_cpu(rw->slba);
2008     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2009     size_t mlen = nvme_m2b(ns, nlb);
2010     uint64_t offset = nvme_moff(ns, slba);
2011     BlockBackend *blk = ns->blkconf.blk;
2012 
2013     trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2014 
2015     if (ret) {
2016         goto out;
2017     }
2018 
2019     ctx->mdata.bounce = g_malloc(mlen);
2020 
2021     qemu_iovec_reset(&ctx->mdata.iov);
2022     qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2023 
2024     req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2025                                 nvme_verify_cb, ctx);
2026     return;
2027 
2028 out:
2029     nvme_verify_cb(ctx, ret);
2030 }
2031 
2032 struct nvme_compare_ctx {
2033     struct {
2034         QEMUIOVector iov;
2035         uint8_t *bounce;
2036     } data;
2037 
2038     struct {
2039         QEMUIOVector iov;
2040         uint8_t *bounce;
2041     } mdata;
2042 };
2043 
2044 static void nvme_compare_mdata_cb(void *opaque, int ret)
2045 {
2046     NvmeRequest *req = opaque;
2047     NvmeNamespace *ns = req->ns;
2048     NvmeCtrl *n = nvme_ctrl(req);
2049     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2050     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2051     uint16_t apptag = le16_to_cpu(rw->apptag);
2052     uint16_t appmask = le16_to_cpu(rw->appmask);
2053     uint32_t reftag = le32_to_cpu(rw->reftag);
2054     struct nvme_compare_ctx *ctx = req->opaque;
2055     g_autofree uint8_t *buf = NULL;
2056     BlockBackend *blk = ns->blkconf.blk;
2057     BlockAcctCookie *acct = &req->acct;
2058     BlockAcctStats *stats = blk_get_stats(blk);
2059     uint16_t status = NVME_SUCCESS;
2060 
2061     trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2062 
2063     if (ret) {
2064         block_acct_failed(stats, acct);
2065         nvme_aio_err(req, ret);
2066         goto out;
2067     }
2068 
2069     buf = g_malloc(ctx->mdata.iov.size);
2070 
2071     status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2072                                NVME_TX_DIRECTION_TO_DEVICE, req);
2073     if (status) {
2074         req->status = status;
2075         goto out;
2076     }
2077 
2078     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2079         uint64_t slba = le64_to_cpu(rw->slba);
2080         uint8_t *bufp;
2081         uint8_t *mbufp = ctx->mdata.bounce;
2082         uint8_t *end = mbufp + ctx->mdata.iov.size;
2083         int16_t pil = 0;
2084 
2085         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2086                                 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2087                                 slba, apptag, appmask, &reftag);
2088         if (status) {
2089             req->status = status;
2090             goto out;
2091         }
2092 
2093         /*
2094          * When formatted with protection information, do not compare the DIF
2095          * tuple.
2096          */
2097         if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2098             pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2099         }
2100 
2101         for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2102             if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2103                 req->status = NVME_CMP_FAILURE;
2104                 goto out;
2105             }
2106         }
2107 
2108         goto out;
2109     }
2110 
2111     if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2112         req->status = NVME_CMP_FAILURE;
2113         goto out;
2114     }
2115 
2116     block_acct_done(stats, acct);
2117 
2118 out:
2119     qemu_iovec_destroy(&ctx->data.iov);
2120     g_free(ctx->data.bounce);
2121 
2122     qemu_iovec_destroy(&ctx->mdata.iov);
2123     g_free(ctx->mdata.bounce);
2124 
2125     g_free(ctx);
2126 
2127     nvme_enqueue_req_completion(nvme_cq(req), req);
2128 }
2129 
2130 static void nvme_compare_data_cb(void *opaque, int ret)
2131 {
2132     NvmeRequest *req = opaque;
2133     NvmeCtrl *n = nvme_ctrl(req);
2134     NvmeNamespace *ns = req->ns;
2135     BlockBackend *blk = ns->blkconf.blk;
2136     BlockAcctCookie *acct = &req->acct;
2137     BlockAcctStats *stats = blk_get_stats(blk);
2138 
2139     struct nvme_compare_ctx *ctx = req->opaque;
2140     g_autofree uint8_t *buf = NULL;
2141     uint16_t status;
2142 
2143     trace_pci_nvme_compare_data_cb(nvme_cid(req));
2144 
2145     if (ret) {
2146         block_acct_failed(stats, acct);
2147         nvme_aio_err(req, ret);
2148         goto out;
2149     }
2150 
2151     buf = g_malloc(ctx->data.iov.size);
2152 
2153     status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2154                               NVME_TX_DIRECTION_TO_DEVICE, req);
2155     if (status) {
2156         req->status = status;
2157         goto out;
2158     }
2159 
2160     if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2161         req->status = NVME_CMP_FAILURE;
2162         goto out;
2163     }
2164 
2165     if (ns->lbaf.ms) {
2166         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2167         uint64_t slba = le64_to_cpu(rw->slba);
2168         uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2169         size_t mlen = nvme_m2b(ns, nlb);
2170         uint64_t offset = nvme_moff(ns, slba);
2171 
2172         ctx->mdata.bounce = g_malloc(mlen);
2173 
2174         qemu_iovec_init(&ctx->mdata.iov, 1);
2175         qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2176 
2177         req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2178                                     nvme_compare_mdata_cb, req);
2179         return;
2180     }
2181 
2182     block_acct_done(stats, acct);
2183 
2184 out:
2185     qemu_iovec_destroy(&ctx->data.iov);
2186     g_free(ctx->data.bounce);
2187     g_free(ctx);
2188 
2189     nvme_enqueue_req_completion(nvme_cq(req), req);
2190 }
2191 
2192 typedef struct NvmeDSMAIOCB {
2193     BlockAIOCB common;
2194     BlockAIOCB *aiocb;
2195     NvmeRequest *req;
2196     QEMUBH *bh;
2197     int ret;
2198 
2199     NvmeDsmRange *range;
2200     unsigned int nr;
2201     unsigned int idx;
2202 } NvmeDSMAIOCB;
2203 
2204 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2205 {
2206     NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2207 
2208     /* break nvme_dsm_cb loop */
2209     iocb->idx = iocb->nr;
2210     iocb->ret = -ECANCELED;
2211 
2212     if (iocb->aiocb) {
2213         blk_aio_cancel_async(iocb->aiocb);
2214         iocb->aiocb = NULL;
2215     } else {
2216         /*
2217          * We only reach this if nvme_dsm_cancel() has already been called or
2218          * the command ran to completion and nvme_dsm_bh is scheduled to run.
2219          */
2220         assert(iocb->idx == iocb->nr);
2221     }
2222 }
2223 
2224 static const AIOCBInfo nvme_dsm_aiocb_info = {
2225     .aiocb_size   = sizeof(NvmeDSMAIOCB),
2226     .cancel_async = nvme_dsm_cancel,
2227 };
2228 
2229 static void nvme_dsm_bh(void *opaque)
2230 {
2231     NvmeDSMAIOCB *iocb = opaque;
2232 
2233     iocb->common.cb(iocb->common.opaque, iocb->ret);
2234 
2235     qemu_bh_delete(iocb->bh);
2236     iocb->bh = NULL;
2237     qemu_aio_unref(iocb);
2238 }
2239 
2240 static void nvme_dsm_cb(void *opaque, int ret);
2241 
2242 static void nvme_dsm_md_cb(void *opaque, int ret)
2243 {
2244     NvmeDSMAIOCB *iocb = opaque;
2245     NvmeRequest *req = iocb->req;
2246     NvmeNamespace *ns = req->ns;
2247     NvmeDsmRange *range;
2248     uint64_t slba;
2249     uint32_t nlb;
2250 
2251     if (ret < 0) {
2252         iocb->ret = ret;
2253         goto done;
2254     }
2255 
2256     if (!ns->lbaf.ms) {
2257         nvme_dsm_cb(iocb, 0);
2258         return;
2259     }
2260 
2261     range = &iocb->range[iocb->idx - 1];
2262     slba = le64_to_cpu(range->slba);
2263     nlb = le32_to_cpu(range->nlb);
2264 
2265     /*
2266      * Check that all block were discarded (zeroed); otherwise we do not zero
2267      * the metadata.
2268      */
2269 
2270     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2271     if (ret) {
2272         if (ret < 0) {
2273             iocb->ret = ret;
2274             goto done;
2275         }
2276 
2277         nvme_dsm_cb(iocb, 0);
2278     }
2279 
2280     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2281                                         nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2282                                         nvme_dsm_cb, iocb);
2283     return;
2284 
2285 done:
2286     iocb->aiocb = NULL;
2287     qemu_bh_schedule(iocb->bh);
2288 }
2289 
2290 static void nvme_dsm_cb(void *opaque, int ret)
2291 {
2292     NvmeDSMAIOCB *iocb = opaque;
2293     NvmeRequest *req = iocb->req;
2294     NvmeCtrl *n = nvme_ctrl(req);
2295     NvmeNamespace *ns = req->ns;
2296     NvmeDsmRange *range;
2297     uint64_t slba;
2298     uint32_t nlb;
2299 
2300     if (ret < 0) {
2301         iocb->ret = ret;
2302         goto done;
2303     }
2304 
2305 next:
2306     if (iocb->idx == iocb->nr) {
2307         goto done;
2308     }
2309 
2310     range = &iocb->range[iocb->idx++];
2311     slba = le64_to_cpu(range->slba);
2312     nlb = le32_to_cpu(range->nlb);
2313 
2314     trace_pci_nvme_dsm_deallocate(slba, nlb);
2315 
2316     if (nlb > n->dmrsl) {
2317         trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2318         goto next;
2319     }
2320 
2321     if (nvme_check_bounds(ns, slba, nlb)) {
2322         trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2323                                              ns->id_ns.nsze);
2324         goto next;
2325     }
2326 
2327     iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2328                                    nvme_l2b(ns, nlb),
2329                                    nvme_dsm_md_cb, iocb);
2330     return;
2331 
2332 done:
2333     iocb->aiocb = NULL;
2334     qemu_bh_schedule(iocb->bh);
2335 }
2336 
2337 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2338 {
2339     NvmeNamespace *ns = req->ns;
2340     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2341     uint32_t attr = le32_to_cpu(dsm->attributes);
2342     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2343     uint16_t status = NVME_SUCCESS;
2344 
2345     trace_pci_nvme_dsm(nr, attr);
2346 
2347     if (attr & NVME_DSMGMT_AD) {
2348         NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2349                                          nvme_misc_cb, req);
2350 
2351         iocb->req = req;
2352         iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2353         iocb->ret = 0;
2354         iocb->range = g_new(NvmeDsmRange, nr);
2355         iocb->nr = nr;
2356         iocb->idx = 0;
2357 
2358         status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2359                           req);
2360         if (status) {
2361             return status;
2362         }
2363 
2364         req->aiocb = &iocb->common;
2365         nvme_dsm_cb(iocb, 0);
2366 
2367         return NVME_NO_COMPLETE;
2368     }
2369 
2370     return status;
2371 }
2372 
2373 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2374 {
2375     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2376     NvmeNamespace *ns = req->ns;
2377     BlockBackend *blk = ns->blkconf.blk;
2378     uint64_t slba = le64_to_cpu(rw->slba);
2379     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2380     size_t len = nvme_l2b(ns, nlb);
2381     int64_t offset = nvme_l2b(ns, slba);
2382     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2383     uint32_t reftag = le32_to_cpu(rw->reftag);
2384     NvmeBounceContext *ctx = NULL;
2385     uint16_t status;
2386 
2387     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2388 
2389     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2390         status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2391         if (status) {
2392             return status;
2393         }
2394 
2395         if (prinfo & NVME_PRINFO_PRACT) {
2396             return NVME_INVALID_PROT_INFO | NVME_DNR;
2397         }
2398     }
2399 
2400     if (len > n->page_size << n->params.vsl) {
2401         return NVME_INVALID_FIELD | NVME_DNR;
2402     }
2403 
2404     status = nvme_check_bounds(ns, slba, nlb);
2405     if (status) {
2406         return status;
2407     }
2408 
2409     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2410         status = nvme_check_dulbe(ns, slba, nlb);
2411         if (status) {
2412             return status;
2413         }
2414     }
2415 
2416     ctx = g_new0(NvmeBounceContext, 1);
2417     ctx->req = req;
2418 
2419     ctx->data.bounce = g_malloc(len);
2420 
2421     qemu_iovec_init(&ctx->data.iov, 1);
2422     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2423 
2424     block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2425                      BLOCK_ACCT_READ);
2426 
2427     req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2428                                 nvme_verify_mdata_in_cb, ctx);
2429     return NVME_NO_COMPLETE;
2430 }
2431 
2432 typedef struct NvmeCopyAIOCB {
2433     BlockAIOCB common;
2434     BlockAIOCB *aiocb;
2435     NvmeRequest *req;
2436     QEMUBH *bh;
2437     int ret;
2438 
2439     NvmeCopySourceRange *ranges;
2440     int nr;
2441     int idx;
2442 
2443     uint8_t *bounce;
2444     QEMUIOVector iov;
2445     struct {
2446         BlockAcctCookie read;
2447         BlockAcctCookie write;
2448     } acct;
2449 
2450     uint32_t reftag;
2451     uint64_t slba;
2452 
2453     NvmeZone *zone;
2454 } NvmeCopyAIOCB;
2455 
2456 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2457 {
2458     NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2459 
2460     iocb->ret = -ECANCELED;
2461 
2462     if (iocb->aiocb) {
2463         blk_aio_cancel_async(iocb->aiocb);
2464         iocb->aiocb = NULL;
2465     }
2466 }
2467 
2468 static const AIOCBInfo nvme_copy_aiocb_info = {
2469     .aiocb_size   = sizeof(NvmeCopyAIOCB),
2470     .cancel_async = nvme_copy_cancel,
2471 };
2472 
2473 static void nvme_copy_bh(void *opaque)
2474 {
2475     NvmeCopyAIOCB *iocb = opaque;
2476     NvmeRequest *req = iocb->req;
2477     NvmeNamespace *ns = req->ns;
2478     BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2479 
2480     if (iocb->idx != iocb->nr) {
2481         req->cqe.result = cpu_to_le32(iocb->idx);
2482     }
2483 
2484     qemu_iovec_destroy(&iocb->iov);
2485     g_free(iocb->bounce);
2486 
2487     qemu_bh_delete(iocb->bh);
2488     iocb->bh = NULL;
2489 
2490     if (iocb->ret < 0) {
2491         block_acct_failed(stats, &iocb->acct.read);
2492         block_acct_failed(stats, &iocb->acct.write);
2493     } else {
2494         block_acct_done(stats, &iocb->acct.read);
2495         block_acct_done(stats, &iocb->acct.write);
2496     }
2497 
2498     iocb->common.cb(iocb->common.opaque, iocb->ret);
2499     qemu_aio_unref(iocb);
2500 }
2501 
2502 static void nvme_copy_cb(void *opaque, int ret);
2503 
2504 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2505 {
2506     NvmeCopyAIOCB *iocb = opaque;
2507     NvmeRequest *req = iocb->req;
2508     NvmeNamespace *ns = req->ns;
2509     NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
2510     uint32_t nlb = le32_to_cpu(range->nlb) + 1;
2511 
2512     if (ret < 0) {
2513         iocb->ret = ret;
2514         goto out;
2515     } else if (iocb->ret < 0) {
2516         goto out;
2517     }
2518 
2519     if (ns->params.zoned) {
2520         nvme_advance_zone_wp(ns, iocb->zone, nlb);
2521     }
2522 
2523     iocb->idx++;
2524     iocb->slba += nlb;
2525 out:
2526     nvme_copy_cb(iocb, iocb->ret);
2527 }
2528 
2529 static void nvme_copy_out_cb(void *opaque, int ret)
2530 {
2531     NvmeCopyAIOCB *iocb = opaque;
2532     NvmeRequest *req = iocb->req;
2533     NvmeNamespace *ns = req->ns;
2534     NvmeCopySourceRange *range;
2535     uint32_t nlb;
2536     size_t mlen;
2537     uint8_t *mbounce;
2538 
2539     if (ret < 0) {
2540         iocb->ret = ret;
2541         goto out;
2542     } else if (iocb->ret < 0) {
2543         goto out;
2544     }
2545 
2546     if (!ns->lbaf.ms) {
2547         nvme_copy_out_completed_cb(iocb, 0);
2548         return;
2549     }
2550 
2551     range = &iocb->ranges[iocb->idx];
2552     nlb = le32_to_cpu(range->nlb) + 1;
2553 
2554     mlen = nvme_m2b(ns, nlb);
2555     mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2556 
2557     qemu_iovec_reset(&iocb->iov);
2558     qemu_iovec_add(&iocb->iov, mbounce, mlen);
2559 
2560     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2561                                   &iocb->iov, 0, nvme_copy_out_completed_cb,
2562                                   iocb);
2563 
2564     return;
2565 
2566 out:
2567     nvme_copy_cb(iocb, ret);
2568 }
2569 
2570 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2571 {
2572     NvmeCopyAIOCB *iocb = opaque;
2573     NvmeRequest *req = iocb->req;
2574     NvmeNamespace *ns = req->ns;
2575     NvmeCopySourceRange *range;
2576     uint32_t nlb;
2577     size_t len;
2578     uint16_t status;
2579 
2580     if (ret < 0) {
2581         iocb->ret = ret;
2582         goto out;
2583     } else if (iocb->ret < 0) {
2584         goto out;
2585     }
2586 
2587     range = &iocb->ranges[iocb->idx];
2588     nlb = le32_to_cpu(range->nlb) + 1;
2589     len = nvme_l2b(ns, nlb);
2590 
2591     trace_pci_nvme_copy_out(iocb->slba, nlb);
2592 
2593     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2594         NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2595 
2596         uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2597         uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2598 
2599         uint16_t apptag = le16_to_cpu(range->apptag);
2600         uint16_t appmask = le16_to_cpu(range->appmask);
2601         uint32_t reftag = le32_to_cpu(range->reftag);
2602 
2603         uint64_t slba = le64_to_cpu(range->slba);
2604         size_t mlen = nvme_m2b(ns, nlb);
2605         uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2606 
2607         status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2608                                 slba, apptag, appmask, &reftag);
2609         if (status) {
2610             goto invalid;
2611         }
2612 
2613         apptag = le16_to_cpu(copy->apptag);
2614         appmask = le16_to_cpu(copy->appmask);
2615 
2616         if (prinfow & NVME_PRINFO_PRACT) {
2617             status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2618             if (status) {
2619                 goto invalid;
2620             }
2621 
2622             nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2623                                         apptag, &iocb->reftag);
2624         } else {
2625             status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2626                                     prinfow, iocb->slba, apptag, appmask,
2627                                     &iocb->reftag);
2628             if (status) {
2629                 goto invalid;
2630             }
2631         }
2632     }
2633 
2634     status = nvme_check_bounds(ns, iocb->slba, nlb);
2635     if (status) {
2636         goto invalid;
2637     }
2638 
2639     if (ns->params.zoned) {
2640         status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2641         if (status) {
2642             goto invalid;
2643         }
2644 
2645         iocb->zone->w_ptr += nlb;
2646     }
2647 
2648     qemu_iovec_reset(&iocb->iov);
2649     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2650 
2651     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2652                                   &iocb->iov, 0, nvme_copy_out_cb, iocb);
2653 
2654     return;
2655 
2656 invalid:
2657     req->status = status;
2658     iocb->aiocb = NULL;
2659     if (iocb->bh) {
2660         qemu_bh_schedule(iocb->bh);
2661     }
2662 
2663     return;
2664 
2665 out:
2666     nvme_copy_cb(iocb, ret);
2667 }
2668 
2669 static void nvme_copy_in_cb(void *opaque, int ret)
2670 {
2671     NvmeCopyAIOCB *iocb = opaque;
2672     NvmeRequest *req = iocb->req;
2673     NvmeNamespace *ns = req->ns;
2674     NvmeCopySourceRange *range;
2675     uint64_t slba;
2676     uint32_t nlb;
2677 
2678     if (ret < 0) {
2679         iocb->ret = ret;
2680         goto out;
2681     } else if (iocb->ret < 0) {
2682         goto out;
2683     }
2684 
2685     if (!ns->lbaf.ms) {
2686         nvme_copy_in_completed_cb(iocb, 0);
2687         return;
2688     }
2689 
2690     range = &iocb->ranges[iocb->idx];
2691     slba = le64_to_cpu(range->slba);
2692     nlb = le32_to_cpu(range->nlb) + 1;
2693 
2694     qemu_iovec_reset(&iocb->iov);
2695     qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2696                    nvme_m2b(ns, nlb));
2697 
2698     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2699                                  &iocb->iov, 0, nvme_copy_in_completed_cb,
2700                                  iocb);
2701     return;
2702 
2703 out:
2704     nvme_copy_cb(iocb, iocb->ret);
2705 }
2706 
2707 static void nvme_copy_cb(void *opaque, int ret)
2708 {
2709     NvmeCopyAIOCB *iocb = opaque;
2710     NvmeRequest *req = iocb->req;
2711     NvmeNamespace *ns = req->ns;
2712     NvmeCopySourceRange *range;
2713     uint64_t slba;
2714     uint32_t nlb;
2715     size_t len;
2716     uint16_t status;
2717 
2718     if (ret < 0) {
2719         iocb->ret = ret;
2720         goto done;
2721     } else if (iocb->ret < 0) {
2722         goto done;
2723     }
2724 
2725     if (iocb->idx == iocb->nr) {
2726         goto done;
2727     }
2728 
2729     range = &iocb->ranges[iocb->idx];
2730     slba = le64_to_cpu(range->slba);
2731     nlb = le32_to_cpu(range->nlb) + 1;
2732     len = nvme_l2b(ns, nlb);
2733 
2734     trace_pci_nvme_copy_source_range(slba, nlb);
2735 
2736     if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2737         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2738         goto invalid;
2739     }
2740 
2741     status = nvme_check_bounds(ns, slba, nlb);
2742     if (status) {
2743         goto invalid;
2744     }
2745 
2746     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2747         status = nvme_check_dulbe(ns, slba, nlb);
2748         if (status) {
2749             goto invalid;
2750         }
2751     }
2752 
2753     if (ns->params.zoned) {
2754         status = nvme_check_zone_read(ns, slba, nlb);
2755         if (status) {
2756             goto invalid;
2757         }
2758     }
2759 
2760     qemu_iovec_reset(&iocb->iov);
2761     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2762 
2763     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2764                                  &iocb->iov, 0, nvme_copy_in_cb, iocb);
2765     return;
2766 
2767 invalid:
2768     req->status = status;
2769 done:
2770     iocb->aiocb = NULL;
2771     if (iocb->bh) {
2772         qemu_bh_schedule(iocb->bh);
2773     }
2774 }
2775 
2776 
2777 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2778 {
2779     NvmeNamespace *ns = req->ns;
2780     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2781     NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2782                                       nvme_misc_cb, req);
2783     uint16_t nr = copy->nr + 1;
2784     uint8_t format = copy->control[0] & 0xf;
2785     uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2786     uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2787 
2788     uint16_t status;
2789 
2790     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2791 
2792     iocb->ranges = NULL;
2793     iocb->zone = NULL;
2794 
2795     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2796         ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2797         status = NVME_INVALID_FIELD | NVME_DNR;
2798         goto invalid;
2799     }
2800 
2801     if (!(n->id_ctrl.ocfs & (1 << format))) {
2802         trace_pci_nvme_err_copy_invalid_format(format);
2803         status = NVME_INVALID_FIELD | NVME_DNR;
2804         goto invalid;
2805     }
2806 
2807     if (nr > ns->id_ns.msrc + 1) {
2808         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2809         goto invalid;
2810     }
2811 
2812     iocb->ranges = g_new(NvmeCopySourceRange, nr);
2813 
2814     status = nvme_h2c(n, (uint8_t *)iocb->ranges,
2815                       sizeof(NvmeCopySourceRange) * nr, req);
2816     if (status) {
2817         goto invalid;
2818     }
2819 
2820     iocb->slba = le64_to_cpu(copy->sdlba);
2821 
2822     if (ns->params.zoned) {
2823         iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
2824         if (!iocb->zone) {
2825             status = NVME_LBA_RANGE | NVME_DNR;
2826             goto invalid;
2827         }
2828 
2829         status = nvme_zrm_auto(n, ns, iocb->zone);
2830         if (status) {
2831             goto invalid;
2832         }
2833     }
2834 
2835     iocb->req = req;
2836     iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
2837     iocb->ret = 0;
2838     iocb->nr = nr;
2839     iocb->idx = 0;
2840     iocb->reftag = le32_to_cpu(copy->reftag);
2841     iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
2842                               ns->lbasz + ns->lbaf.ms);
2843 
2844     qemu_iovec_init(&iocb->iov, 1);
2845 
2846     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
2847                      BLOCK_ACCT_READ);
2848     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
2849                      BLOCK_ACCT_WRITE);
2850 
2851     req->aiocb = &iocb->common;
2852     nvme_copy_cb(iocb, 0);
2853 
2854     return NVME_NO_COMPLETE;
2855 
2856 invalid:
2857     g_free(iocb->ranges);
2858     qemu_aio_unref(iocb);
2859     return status;
2860 }
2861 
2862 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2863 {
2864     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2865     NvmeNamespace *ns = req->ns;
2866     BlockBackend *blk = ns->blkconf.blk;
2867     uint64_t slba = le64_to_cpu(rw->slba);
2868     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2869     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2870     size_t data_len = nvme_l2b(ns, nlb);
2871     size_t len = data_len;
2872     int64_t offset = nvme_l2b(ns, slba);
2873     struct nvme_compare_ctx *ctx = NULL;
2874     uint16_t status;
2875 
2876     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2877 
2878     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
2879         return NVME_INVALID_PROT_INFO | NVME_DNR;
2880     }
2881 
2882     if (nvme_ns_ext(ns)) {
2883         len += nvme_m2b(ns, nlb);
2884     }
2885 
2886     status = nvme_check_mdts(n, len);
2887     if (status) {
2888         return status;
2889     }
2890 
2891     status = nvme_check_bounds(ns, slba, nlb);
2892     if (status) {
2893         return status;
2894     }
2895 
2896     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2897         status = nvme_check_dulbe(ns, slba, nlb);
2898         if (status) {
2899             return status;
2900         }
2901     }
2902 
2903     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2904     if (status) {
2905         return status;
2906     }
2907 
2908     ctx = g_new(struct nvme_compare_ctx, 1);
2909     ctx->data.bounce = g_malloc(data_len);
2910 
2911     req->opaque = ctx;
2912 
2913     qemu_iovec_init(&ctx->data.iov, 1);
2914     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2915 
2916     block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2917                      BLOCK_ACCT_READ);
2918     req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2919                                 nvme_compare_data_cb, req);
2920 
2921     return NVME_NO_COMPLETE;
2922 }
2923 
2924 typedef struct NvmeFlushAIOCB {
2925     BlockAIOCB common;
2926     BlockAIOCB *aiocb;
2927     NvmeRequest *req;
2928     QEMUBH *bh;
2929     int ret;
2930 
2931     NvmeNamespace *ns;
2932     uint32_t nsid;
2933     bool broadcast;
2934 } NvmeFlushAIOCB;
2935 
2936 static void nvme_flush_cancel(BlockAIOCB *acb)
2937 {
2938     NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
2939 
2940     iocb->ret = -ECANCELED;
2941 
2942     if (iocb->aiocb) {
2943         blk_aio_cancel_async(iocb->aiocb);
2944     }
2945 }
2946 
2947 static const AIOCBInfo nvme_flush_aiocb_info = {
2948     .aiocb_size = sizeof(NvmeFlushAIOCB),
2949     .cancel_async = nvme_flush_cancel,
2950     .get_aio_context = nvme_get_aio_context,
2951 };
2952 
2953 static void nvme_flush_ns_cb(void *opaque, int ret)
2954 {
2955     NvmeFlushAIOCB *iocb = opaque;
2956     NvmeNamespace *ns = iocb->ns;
2957 
2958     if (ret < 0) {
2959         iocb->ret = ret;
2960         goto out;
2961     } else if (iocb->ret < 0) {
2962         goto out;
2963     }
2964 
2965     if (ns) {
2966         trace_pci_nvme_flush_ns(iocb->nsid);
2967 
2968         iocb->ns = NULL;
2969         iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
2970         return;
2971     }
2972 
2973 out:
2974     iocb->aiocb = NULL;
2975     qemu_bh_schedule(iocb->bh);
2976 }
2977 
2978 static void nvme_flush_bh(void *opaque)
2979 {
2980     NvmeFlushAIOCB *iocb = opaque;
2981     NvmeRequest *req = iocb->req;
2982     NvmeCtrl *n = nvme_ctrl(req);
2983     int i;
2984 
2985     if (iocb->ret < 0) {
2986         goto done;
2987     }
2988 
2989     if (iocb->broadcast) {
2990         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
2991             iocb->ns = nvme_ns(n, i);
2992             if (iocb->ns) {
2993                 iocb->nsid = i;
2994                 break;
2995             }
2996         }
2997     }
2998 
2999     if (!iocb->ns) {
3000         goto done;
3001     }
3002 
3003     nvme_flush_ns_cb(iocb, 0);
3004     return;
3005 
3006 done:
3007     qemu_bh_delete(iocb->bh);
3008     iocb->bh = NULL;
3009 
3010     iocb->common.cb(iocb->common.opaque, iocb->ret);
3011 
3012     qemu_aio_unref(iocb);
3013 
3014     return;
3015 }
3016 
3017 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3018 {
3019     NvmeFlushAIOCB *iocb;
3020     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3021     uint16_t status;
3022 
3023     iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3024 
3025     iocb->req = req;
3026     iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3027     iocb->ret = 0;
3028     iocb->ns = NULL;
3029     iocb->nsid = 0;
3030     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3031 
3032     if (!iocb->broadcast) {
3033         if (!nvme_nsid_valid(n, nsid)) {
3034             status = NVME_INVALID_NSID | NVME_DNR;
3035             goto out;
3036         }
3037 
3038         iocb->ns = nvme_ns(n, nsid);
3039         if (!iocb->ns) {
3040             status = NVME_INVALID_FIELD | NVME_DNR;
3041             goto out;
3042         }
3043 
3044         iocb->nsid = nsid;
3045     }
3046 
3047     req->aiocb = &iocb->common;
3048     qemu_bh_schedule(iocb->bh);
3049 
3050     return NVME_NO_COMPLETE;
3051 
3052 out:
3053     qemu_bh_delete(iocb->bh);
3054     iocb->bh = NULL;
3055     qemu_aio_unref(iocb);
3056 
3057     return status;
3058 }
3059 
3060 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3061 {
3062     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3063     NvmeNamespace *ns = req->ns;
3064     uint64_t slba = le64_to_cpu(rw->slba);
3065     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3066     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3067     uint64_t data_size = nvme_l2b(ns, nlb);
3068     uint64_t mapped_size = data_size;
3069     uint64_t data_offset;
3070     BlockBackend *blk = ns->blkconf.blk;
3071     uint16_t status;
3072 
3073     if (nvme_ns_ext(ns)) {
3074         mapped_size += nvme_m2b(ns, nlb);
3075 
3076         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3077             bool pract = prinfo & NVME_PRINFO_PRACT;
3078 
3079             if (pract && ns->lbaf.ms == 8) {
3080                 mapped_size = data_size;
3081             }
3082         }
3083     }
3084 
3085     trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3086 
3087     status = nvme_check_mdts(n, mapped_size);
3088     if (status) {
3089         goto invalid;
3090     }
3091 
3092     status = nvme_check_bounds(ns, slba, nlb);
3093     if (status) {
3094         goto invalid;
3095     }
3096 
3097     if (ns->params.zoned) {
3098         status = nvme_check_zone_read(ns, slba, nlb);
3099         if (status) {
3100             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3101             goto invalid;
3102         }
3103     }
3104 
3105     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3106         status = nvme_check_dulbe(ns, slba, nlb);
3107         if (status) {
3108             goto invalid;
3109         }
3110     }
3111 
3112     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3113         return nvme_dif_rw(n, req);
3114     }
3115 
3116     status = nvme_map_data(n, nlb, req);
3117     if (status) {
3118         goto invalid;
3119     }
3120 
3121     data_offset = nvme_l2b(ns, slba);
3122 
3123     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3124                      BLOCK_ACCT_READ);
3125     nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3126     return NVME_NO_COMPLETE;
3127 
3128 invalid:
3129     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3130     return status | NVME_DNR;
3131 }
3132 
3133 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3134                               bool wrz)
3135 {
3136     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3137     NvmeNamespace *ns = req->ns;
3138     uint64_t slba = le64_to_cpu(rw->slba);
3139     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3140     uint16_t ctrl = le16_to_cpu(rw->control);
3141     uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3142     uint64_t data_size = nvme_l2b(ns, nlb);
3143     uint64_t mapped_size = data_size;
3144     uint64_t data_offset;
3145     NvmeZone *zone;
3146     NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3147     BlockBackend *blk = ns->blkconf.blk;
3148     uint16_t status;
3149 
3150     if (nvme_ns_ext(ns)) {
3151         mapped_size += nvme_m2b(ns, nlb);
3152 
3153         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3154             bool pract = prinfo & NVME_PRINFO_PRACT;
3155 
3156             if (pract && ns->lbaf.ms == 8) {
3157                 mapped_size -= nvme_m2b(ns, nlb);
3158             }
3159         }
3160     }
3161 
3162     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3163                          nvme_nsid(ns), nlb, mapped_size, slba);
3164 
3165     if (!wrz) {
3166         status = nvme_check_mdts(n, mapped_size);
3167         if (status) {
3168             goto invalid;
3169         }
3170     }
3171 
3172     status = nvme_check_bounds(ns, slba, nlb);
3173     if (status) {
3174         goto invalid;
3175     }
3176 
3177     if (ns->params.zoned) {
3178         zone = nvme_get_zone_by_slba(ns, slba);
3179         assert(zone);
3180 
3181         if (append) {
3182             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3183 
3184             if (unlikely(slba != zone->d.zslba)) {
3185                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3186                 status = NVME_INVALID_FIELD;
3187                 goto invalid;
3188             }
3189 
3190             if (n->params.zasl &&
3191                 data_size > (uint64_t)n->page_size << n->params.zasl) {
3192                 trace_pci_nvme_err_zasl(data_size);
3193                 return NVME_INVALID_FIELD | NVME_DNR;
3194             }
3195 
3196             slba = zone->w_ptr;
3197             rw->slba = cpu_to_le64(slba);
3198             res->slba = cpu_to_le64(slba);
3199 
3200             switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3201             case NVME_ID_NS_DPS_TYPE_1:
3202                 if (!piremap) {
3203                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3204                 }
3205 
3206                 /* fallthrough */
3207 
3208             case NVME_ID_NS_DPS_TYPE_2:
3209                 if (piremap) {
3210                     uint32_t reftag = le32_to_cpu(rw->reftag);
3211                     rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3212                 }
3213 
3214                 break;
3215 
3216             case NVME_ID_NS_DPS_TYPE_3:
3217                 if (piremap) {
3218                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3219                 }
3220 
3221                 break;
3222             }
3223         }
3224 
3225         status = nvme_check_zone_write(ns, zone, slba, nlb);
3226         if (status) {
3227             goto invalid;
3228         }
3229 
3230         status = nvme_zrm_auto(n, ns, zone);
3231         if (status) {
3232             goto invalid;
3233         }
3234 
3235         zone->w_ptr += nlb;
3236     }
3237 
3238     data_offset = nvme_l2b(ns, slba);
3239 
3240     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3241         return nvme_dif_rw(n, req);
3242     }
3243 
3244     if (!wrz) {
3245         status = nvme_map_data(n, nlb, req);
3246         if (status) {
3247             goto invalid;
3248         }
3249 
3250         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3251                          BLOCK_ACCT_WRITE);
3252         nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3253     } else {
3254         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3255                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3256                                            req);
3257     }
3258 
3259     return NVME_NO_COMPLETE;
3260 
3261 invalid:
3262     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3263     return status | NVME_DNR;
3264 }
3265 
3266 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3267 {
3268     return nvme_do_write(n, req, false, false);
3269 }
3270 
3271 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3272 {
3273     return nvme_do_write(n, req, false, true);
3274 }
3275 
3276 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3277 {
3278     return nvme_do_write(n, req, true, false);
3279 }
3280 
3281 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3282                                             uint64_t *slba, uint32_t *zone_idx)
3283 {
3284     uint32_t dw10 = le32_to_cpu(c->cdw10);
3285     uint32_t dw11 = le32_to_cpu(c->cdw11);
3286 
3287     if (!ns->params.zoned) {
3288         trace_pci_nvme_err_invalid_opc(c->opcode);
3289         return NVME_INVALID_OPCODE | NVME_DNR;
3290     }
3291 
3292     *slba = ((uint64_t)dw11) << 32 | dw10;
3293     if (unlikely(*slba >= ns->id_ns.nsze)) {
3294         trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3295         *slba = 0;
3296         return NVME_LBA_RANGE | NVME_DNR;
3297     }
3298 
3299     *zone_idx = nvme_zone_idx(ns, *slba);
3300     assert(*zone_idx < ns->num_zones);
3301 
3302     return NVME_SUCCESS;
3303 }
3304 
3305 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3306                                  NvmeRequest *);
3307 
/*
 * Selects which zones a bulk zone operation applies to.  The non-zero values
 * are bit flags that may be combined; zero means "only the addressed zone".
 */
enum NvmeZoneProcessingMask {
    NVME_PROC_CURRENT_ZONE    = 0x0,
    NVME_PROC_OPENED_ZONES    = 0x1,    /* implicitly or explicitly open */
    NVME_PROC_CLOSED_ZONES    = 0x2,
    NVME_PROC_READ_ONLY_ZONES = 0x4,
    NVME_PROC_FULL_ZONES      = 0x8,
};
3315 
3316 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3317                                NvmeZoneState state, NvmeRequest *req)
3318 {
3319     return nvme_zrm_open(nvme_ctrl(req), ns, zone);
3320 }
3321 
3322 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3323                                 NvmeZoneState state, NvmeRequest *req)
3324 {
3325     return nvme_zrm_close(ns, zone);
3326 }
3327 
3328 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3329                                  NvmeZoneState state, NvmeRequest *req)
3330 {
3331     return nvme_zrm_finish(ns, zone);
3332 }
3333 
3334 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3335                                   NvmeZoneState state, NvmeRequest *req)
3336 {
3337     switch (state) {
3338     case NVME_ZONE_STATE_READ_ONLY:
3339         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3340         /* fall through */
3341     case NVME_ZONE_STATE_OFFLINE:
3342         return NVME_SUCCESS;
3343     default:
3344         return NVME_ZONE_INVAL_TRANSITION;
3345     }
3346 }
3347 
3348 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3349 {
3350     uint16_t status;
3351     uint8_t state = nvme_get_zone_state(zone);
3352 
3353     if (state == NVME_ZONE_STATE_EMPTY) {
3354         status = nvme_aor_check(ns, 1, 0);
3355         if (status) {
3356             return status;
3357         }
3358         nvme_aor_inc_active(ns);
3359         zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3360         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3361         return NVME_SUCCESS;
3362     }
3363 
3364     return NVME_ZONE_INVAL_TRANSITION;
3365 }
3366 
3367 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3368                                     enum NvmeZoneProcessingMask proc_mask,
3369                                     op_handler_t op_hndlr, NvmeRequest *req)
3370 {
3371     uint16_t status = NVME_SUCCESS;
3372     NvmeZoneState zs = nvme_get_zone_state(zone);
3373     bool proc_zone;
3374 
3375     switch (zs) {
3376     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3377     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3378         proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3379         break;
3380     case NVME_ZONE_STATE_CLOSED:
3381         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3382         break;
3383     case NVME_ZONE_STATE_READ_ONLY:
3384         proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3385         break;
3386     case NVME_ZONE_STATE_FULL:
3387         proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3388         break;
3389     default:
3390         proc_zone = false;
3391     }
3392 
3393     if (proc_zone) {
3394         status = op_hndlr(ns, zone, zs, req);
3395     }
3396 
3397     return status;
3398 }
3399 
3400 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3401                                 enum NvmeZoneProcessingMask proc_mask,
3402                                 op_handler_t op_hndlr, NvmeRequest *req)
3403 {
3404     NvmeZone *next;
3405     uint16_t status = NVME_SUCCESS;
3406     int i;
3407 
3408     if (!proc_mask) {
3409         status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3410     } else {
3411         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3412             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3413                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3414                                              req);
3415                 if (status && status != NVME_NO_COMPLETE) {
3416                     goto out;
3417                 }
3418             }
3419         }
3420         if (proc_mask & NVME_PROC_OPENED_ZONES) {
3421             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3422                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3423                                              req);
3424                 if (status && status != NVME_NO_COMPLETE) {
3425                     goto out;
3426                 }
3427             }
3428 
3429             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3430                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3431                                              req);
3432                 if (status && status != NVME_NO_COMPLETE) {
3433                     goto out;
3434                 }
3435             }
3436         }
3437         if (proc_mask & NVME_PROC_FULL_ZONES) {
3438             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3439                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3440                                              req);
3441                 if (status && status != NVME_NO_COMPLETE) {
3442                     goto out;
3443                 }
3444             }
3445         }
3446 
3447         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3448             for (i = 0; i < ns->num_zones; i++, zone++) {
3449                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3450                                              req);
3451                 if (status && status != NVME_NO_COMPLETE) {
3452                     goto out;
3453                 }
3454             }
3455         }
3456     }
3457 
3458 out:
3459     return status;
3460 }
3461 
3462 typedef struct NvmeZoneResetAIOCB {
3463     BlockAIOCB common;
3464     BlockAIOCB *aiocb;
3465     NvmeRequest *req;
3466     QEMUBH *bh;
3467     int ret;
3468 
3469     bool all;
3470     int idx;
3471     NvmeZone *zone;
3472 } NvmeZoneResetAIOCB;
3473 
3474 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3475 {
3476     NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3477     NvmeRequest *req = iocb->req;
3478     NvmeNamespace *ns = req->ns;
3479 
3480     iocb->idx = ns->num_zones;
3481 
3482     iocb->ret = -ECANCELED;
3483 
3484     if (iocb->aiocb) {
3485         blk_aio_cancel_async(iocb->aiocb);
3486         iocb->aiocb = NULL;
3487     }
3488 }
3489 
3490 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3491     .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3492     .cancel_async = nvme_zone_reset_cancel,
3493 };
3494 
3495 static void nvme_zone_reset_bh(void *opaque)
3496 {
3497     NvmeZoneResetAIOCB *iocb = opaque;
3498 
3499     iocb->common.cb(iocb->common.opaque, iocb->ret);
3500 
3501     qemu_bh_delete(iocb->bh);
3502     iocb->bh = NULL;
3503     qemu_aio_unref(iocb);
3504 }
3505 
3506 static void nvme_zone_reset_cb(void *opaque, int ret);
3507 
3508 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3509 {
3510     NvmeZoneResetAIOCB *iocb = opaque;
3511     NvmeRequest *req = iocb->req;
3512     NvmeNamespace *ns = req->ns;
3513     int64_t moff;
3514     int count;
3515 
3516     if (ret < 0) {
3517         nvme_zone_reset_cb(iocb, ret);
3518         return;
3519     }
3520 
3521     if (!ns->lbaf.ms) {
3522         nvme_zone_reset_cb(iocb, 0);
3523         return;
3524     }
3525 
3526     moff = nvme_moff(ns, iocb->zone->d.zslba);
3527     count = nvme_m2b(ns, ns->zone_size);
3528 
3529     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3530                                         BDRV_REQ_MAY_UNMAP,
3531                                         nvme_zone_reset_cb, iocb);
3532     return;
3533 }
3534 
3535 static void nvme_zone_reset_cb(void *opaque, int ret)
3536 {
3537     NvmeZoneResetAIOCB *iocb = opaque;
3538     NvmeRequest *req = iocb->req;
3539     NvmeNamespace *ns = req->ns;
3540 
3541     if (ret < 0) {
3542         iocb->ret = ret;
3543         goto done;
3544     }
3545 
3546     if (iocb->zone) {
3547         nvme_zrm_reset(ns, iocb->zone);
3548 
3549         if (!iocb->all) {
3550             goto done;
3551         }
3552     }
3553 
3554     while (iocb->idx < ns->num_zones) {
3555         NvmeZone *zone = &ns->zone_array[iocb->idx++];
3556 
3557         switch (nvme_get_zone_state(zone)) {
3558         case NVME_ZONE_STATE_EMPTY:
3559             if (!iocb->all) {
3560                 goto done;
3561             }
3562 
3563             continue;
3564 
3565         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3566         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3567         case NVME_ZONE_STATE_CLOSED:
3568         case NVME_ZONE_STATE_FULL:
3569             iocb->zone = zone;
3570             break;
3571 
3572         default:
3573             continue;
3574         }
3575 
3576         trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3577 
3578         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3579                                             nvme_l2b(ns, zone->d.zslba),
3580                                             nvme_l2b(ns, ns->zone_size),
3581                                             BDRV_REQ_MAY_UNMAP,
3582                                             nvme_zone_reset_epilogue_cb,
3583                                             iocb);
3584         return;
3585     }
3586 
3587 done:
3588     iocb->aiocb = NULL;
3589     if (iocb->bh) {
3590         qemu_bh_schedule(iocb->bh);
3591     }
3592 }
3593 
3594 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3595 {
3596     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3597     NvmeNamespace *ns = req->ns;
3598     NvmeZone *zone;
3599     NvmeZoneResetAIOCB *iocb;
3600     uint8_t *zd_ext;
3601     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3602     uint64_t slba = 0;
3603     uint32_t zone_idx = 0;
3604     uint16_t status;
3605     uint8_t action;
3606     bool all;
3607     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3608 
3609     action = dw13 & 0xff;
3610     all = !!(dw13 & 0x100);
3611 
3612     req->status = NVME_SUCCESS;
3613 
3614     if (!all) {
3615         status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3616         if (status) {
3617             return status;
3618         }
3619     }
3620 
3621     zone = &ns->zone_array[zone_idx];
3622     if (slba != zone->d.zslba) {
3623         trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3624         return NVME_INVALID_FIELD | NVME_DNR;
3625     }
3626 
3627     switch (action) {
3628 
3629     case NVME_ZONE_ACTION_OPEN:
3630         if (all) {
3631             proc_mask = NVME_PROC_CLOSED_ZONES;
3632         }
3633         trace_pci_nvme_open_zone(slba, zone_idx, all);
3634         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3635         break;
3636 
3637     case NVME_ZONE_ACTION_CLOSE:
3638         if (all) {
3639             proc_mask = NVME_PROC_OPENED_ZONES;
3640         }
3641         trace_pci_nvme_close_zone(slba, zone_idx, all);
3642         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3643         break;
3644 
3645     case NVME_ZONE_ACTION_FINISH:
3646         if (all) {
3647             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3648         }
3649         trace_pci_nvme_finish_zone(slba, zone_idx, all);
3650         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3651         break;
3652 
3653     case NVME_ZONE_ACTION_RESET:
3654         trace_pci_nvme_reset_zone(slba, zone_idx, all);
3655 
3656         iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3657                            nvme_misc_cb, req);
3658 
3659         iocb->req = req;
3660         iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3661         iocb->ret = 0;
3662         iocb->all = all;
3663         iocb->idx = zone_idx;
3664         iocb->zone = NULL;
3665 
3666         req->aiocb = &iocb->common;
3667         nvme_zone_reset_cb(iocb, 0);
3668 
3669         return NVME_NO_COMPLETE;
3670 
3671     case NVME_ZONE_ACTION_OFFLINE:
3672         if (all) {
3673             proc_mask = NVME_PROC_READ_ONLY_ZONES;
3674         }
3675         trace_pci_nvme_offline_zone(slba, zone_idx, all);
3676         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3677         break;
3678 
3679     case NVME_ZONE_ACTION_SET_ZD_EXT:
3680         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3681         if (all || !ns->params.zd_extension_size) {
3682             return NVME_INVALID_FIELD | NVME_DNR;
3683         }
3684         zd_ext = nvme_get_zd_extension(ns, zone_idx);
3685         status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3686         if (status) {
3687             trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3688             return status;
3689         }
3690 
3691         status = nvme_set_zd_ext(ns, zone);
3692         if (status == NVME_SUCCESS) {
3693             trace_pci_nvme_zd_extension_set(zone_idx);
3694             return status;
3695         }
3696         break;
3697 
3698     default:
3699         trace_pci_nvme_err_invalid_mgmt_action(action);
3700         status = NVME_INVALID_FIELD;
3701     }
3702 
3703     if (status == NVME_ZONE_INVAL_TRANSITION) {
3704         trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3705                                                          zone->d.za);
3706     }
3707     if (status) {
3708         status |= NVME_DNR;
3709     }
3710 
3711     return status;
3712 }
3713 
3714 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3715 {
3716     NvmeZoneState zs = nvme_get_zone_state(zl);
3717 
3718     switch (zafs) {
3719     case NVME_ZONE_REPORT_ALL:
3720         return true;
3721     case NVME_ZONE_REPORT_EMPTY:
3722         return zs == NVME_ZONE_STATE_EMPTY;
3723     case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3724         return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3725     case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3726         return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3727     case NVME_ZONE_REPORT_CLOSED:
3728         return zs == NVME_ZONE_STATE_CLOSED;
3729     case NVME_ZONE_REPORT_FULL:
3730         return zs == NVME_ZONE_STATE_FULL;
3731     case NVME_ZONE_REPORT_READ_ONLY:
3732         return zs == NVME_ZONE_STATE_READ_ONLY;
3733     case NVME_ZONE_REPORT_OFFLINE:
3734         return zs == NVME_ZONE_STATE_OFFLINE;
3735     default:
3736         return false;
3737     }
3738 }
3739 
3740 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3741 {
3742     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3743     NvmeNamespace *ns = req->ns;
3744     /* cdw12 is zero-based number of dwords to return. Convert to bytes */
3745     uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3746     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3747     uint32_t zone_idx, zra, zrasf, partial;
3748     uint64_t max_zones, nr_zones = 0;
3749     uint16_t status;
3750     uint64_t slba;
3751     NvmeZoneDescr *z;
3752     NvmeZone *zone;
3753     NvmeZoneReportHeader *header;
3754     void *buf, *buf_p;
3755     size_t zone_entry_sz;
3756     int i;
3757 
3758     req->status = NVME_SUCCESS;
3759 
3760     status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3761     if (status) {
3762         return status;
3763     }
3764 
3765     zra = dw13 & 0xff;
3766     if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3767         return NVME_INVALID_FIELD | NVME_DNR;
3768     }
3769     if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3770         return NVME_INVALID_FIELD | NVME_DNR;
3771     }
3772 
3773     zrasf = (dw13 >> 8) & 0xff;
3774     if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3775         return NVME_INVALID_FIELD | NVME_DNR;
3776     }
3777 
3778     if (data_size < sizeof(NvmeZoneReportHeader)) {
3779         return NVME_INVALID_FIELD | NVME_DNR;
3780     }
3781 
3782     status = nvme_check_mdts(n, data_size);
3783     if (status) {
3784         return status;
3785     }
3786 
3787     partial = (dw13 >> 16) & 0x01;
3788 
3789     zone_entry_sz = sizeof(NvmeZoneDescr);
3790     if (zra == NVME_ZONE_REPORT_EXTENDED) {
3791         zone_entry_sz += ns->params.zd_extension_size;
3792     }
3793 
3794     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3795     buf = g_malloc0(data_size);
3796 
3797     zone = &ns->zone_array[zone_idx];
3798     for (i = zone_idx; i < ns->num_zones; i++) {
3799         if (partial && nr_zones >= max_zones) {
3800             break;
3801         }
3802         if (nvme_zone_matches_filter(zrasf, zone++)) {
3803             nr_zones++;
3804         }
3805     }
3806     header = (NvmeZoneReportHeader *)buf;
3807     header->nr_zones = cpu_to_le64(nr_zones);
3808 
3809     buf_p = buf + sizeof(NvmeZoneReportHeader);
3810     for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3811         zone = &ns->zone_array[zone_idx];
3812         if (nvme_zone_matches_filter(zrasf, zone)) {
3813             z = (NvmeZoneDescr *)buf_p;
3814             buf_p += sizeof(NvmeZoneDescr);
3815 
3816             z->zt = zone->d.zt;
3817             z->zs = zone->d.zs;
3818             z->zcap = cpu_to_le64(zone->d.zcap);
3819             z->zslba = cpu_to_le64(zone->d.zslba);
3820             z->za = zone->d.za;
3821 
3822             if (nvme_wp_is_valid(zone)) {
3823                 z->wp = cpu_to_le64(zone->d.wp);
3824             } else {
3825                 z->wp = cpu_to_le64(~0ULL);
3826             }
3827 
3828             if (zra == NVME_ZONE_REPORT_EXTENDED) {
3829                 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3830                     memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3831                            ns->params.zd_extension_size);
3832                 }
3833                 buf_p += ns->params.zd_extension_size;
3834             }
3835 
3836             max_zones--;
3837         }
3838     }
3839 
3840     status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3841 
3842     g_free(buf);
3843 
3844     return status;
3845 }
3846 
3847 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3848 {
3849     NvmeNamespace *ns;
3850     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3851 
3852     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3853                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3854 
3855     if (!nvme_nsid_valid(n, nsid)) {
3856         return NVME_INVALID_NSID | NVME_DNR;
3857     }
3858 
3859     /*
3860      * In the base NVM command set, Flush may apply to all namespaces
3861      * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
3862      * along with TP 4056 (Namespace Types), it may be pretty screwed up.
3863      *
3864      * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
3865      * opcode with a specific command since we cannot determine a unique I/O
3866      * command set. Opcode 0h could have any other meaning than something
3867      * equivalent to flushing and say it DOES have completely different
3868      * semantics in some other command set - does an NSID of FFFFFFFFh then
3869      * mean "for all namespaces, apply whatever command set specific command
3870      * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
3871      * whatever command that uses the 0h opcode if, and only if, it allows NSID
3872      * to be FFFFFFFFh"?
3873      *
3874      * Anyway (and luckily), for now, we do not care about this since the
3875      * device only supports namespace types that includes the NVM Flush command
3876      * (NVM and Zoned), so always do an NVM Flush.
3877      */
3878     if (req->cmd.opcode == NVME_CMD_FLUSH) {
3879         return nvme_flush(n, req);
3880     }
3881 
3882     ns = nvme_ns(n, nsid);
3883     if (unlikely(!ns)) {
3884         return NVME_INVALID_FIELD | NVME_DNR;
3885     }
3886 
3887     if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3888         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3889         return NVME_INVALID_OPCODE | NVME_DNR;
3890     }
3891 
3892     if (ns->status) {
3893         return ns->status;
3894     }
3895 
3896     req->ns = ns;
3897 
3898     switch (req->cmd.opcode) {
3899     case NVME_CMD_WRITE_ZEROES:
3900         return nvme_write_zeroes(n, req);
3901     case NVME_CMD_ZONE_APPEND:
3902         return nvme_zone_append(n, req);
3903     case NVME_CMD_WRITE:
3904         return nvme_write(n, req);
3905     case NVME_CMD_READ:
3906         return nvme_read(n, req);
3907     case NVME_CMD_COMPARE:
3908         return nvme_compare(n, req);
3909     case NVME_CMD_DSM:
3910         return nvme_dsm(n, req);
3911     case NVME_CMD_VERIFY:
3912         return nvme_verify(n, req);
3913     case NVME_CMD_COPY:
3914         return nvme_copy(n, req);
3915     case NVME_CMD_ZONE_MGMT_SEND:
3916         return nvme_zone_mgmt_send(n, req);
3917     case NVME_CMD_ZONE_MGMT_RECV:
3918         return nvme_zone_mgmt_recv(n, req);
3919     default:
3920         assert(false);
3921     }
3922 
3923     return NVME_INVALID_OPCODE | NVME_DNR;
3924 }
3925 
3926 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3927 {
3928     n->sq[sq->sqid] = NULL;
3929     timer_free(sq->timer);
3930     g_free(sq->io_req);
3931     if (sq->sqid) {
3932         g_free(sq);
3933     }
3934 }
3935 
3936 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3937 {
3938     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3939     NvmeRequest *r, *next;
3940     NvmeSQueue *sq;
3941     NvmeCQueue *cq;
3942     uint16_t qid = le16_to_cpu(c->qid);
3943 
3944     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3945         trace_pci_nvme_err_invalid_del_sq(qid);
3946         return NVME_INVALID_QID | NVME_DNR;
3947     }
3948 
3949     trace_pci_nvme_del_sq(qid);
3950 
3951     sq = n->sq[qid];
3952     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3953         r = QTAILQ_FIRST(&sq->out_req_list);
3954         assert(r->aiocb);
3955         blk_aio_cancel(r->aiocb);
3956     }
3957 
3958     assert(QTAILQ_EMPTY(&sq->out_req_list));
3959 
3960     if (!nvme_check_cqid(n, sq->cqid)) {
3961         cq = n->cq[sq->cqid];
3962         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3963 
3964         nvme_post_cqes(cq);
3965         QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3966             if (r->sq == sq) {
3967                 QTAILQ_REMOVE(&cq->req_list, r, entry);
3968                 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3969             }
3970         }
3971     }
3972 
3973     nvme_free_sq(sq, n);
3974     return NVME_SUCCESS;
3975 }
3976 
3977 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3978                          uint16_t sqid, uint16_t cqid, uint16_t size)
3979 {
3980     int i;
3981     NvmeCQueue *cq;
3982 
3983     sq->ctrl = n;
3984     sq->dma_addr = dma_addr;
3985     sq->sqid = sqid;
3986     sq->size = size;
3987     sq->cqid = cqid;
3988     sq->head = sq->tail = 0;
3989     sq->io_req = g_new0(NvmeRequest, sq->size);
3990 
3991     QTAILQ_INIT(&sq->req_list);
3992     QTAILQ_INIT(&sq->out_req_list);
3993     for (i = 0; i < sq->size; i++) {
3994         sq->io_req[i].sq = sq;
3995         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
3996     }
3997     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
3998 
3999     assert(n->cq[cqid]);
4000     cq = n->cq[cqid];
4001     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4002     n->sq[sqid] = sq;
4003 }
4004 
4005 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4006 {
4007     NvmeSQueue *sq;
4008     NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4009 
4010     uint16_t cqid = le16_to_cpu(c->cqid);
4011     uint16_t sqid = le16_to_cpu(c->sqid);
4012     uint16_t qsize = le16_to_cpu(c->qsize);
4013     uint16_t qflags = le16_to_cpu(c->sq_flags);
4014     uint64_t prp1 = le64_to_cpu(c->prp1);
4015 
4016     trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4017 
4018     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4019         trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4020         return NVME_INVALID_CQID | NVME_DNR;
4021     }
4022     if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4023         n->sq[sqid] != NULL)) {
4024         trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4025         return NVME_INVALID_QID | NVME_DNR;
4026     }
4027     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4028         trace_pci_nvme_err_invalid_create_sq_size(qsize);
4029         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4030     }
4031     if (unlikely(prp1 & (n->page_size - 1))) {
4032         trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4033         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4034     }
4035     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4036         trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4037         return NVME_INVALID_FIELD | NVME_DNR;
4038     }
4039     sq = g_malloc0(sizeof(*sq));
4040     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4041     return NVME_SUCCESS;
4042 }
4043 
/*
 * Accumulator for block-layer accounting, summed over one or more namespaces
 * by nvme_set_blk_stats() and reported through the SMART log page.
 */
struct nvme_stats {
    /* read side: byte count shifted right by BDRV_SECTOR_BITS */
    uint64_t units_read;
    /* write side: byte count shifted right by BDRV_SECTOR_BITS */
    uint64_t units_written;

    /* number of read operations */
    uint64_t read_commands;
    /* number of write operations */
    uint64_t write_commands;
};
4050 
4051 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4052 {
4053     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4054 
4055     stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4056     stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4057     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4058     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4059 }
4060 
4061 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4062                                 uint64_t off, NvmeRequest *req)
4063 {
4064     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4065     struct nvme_stats stats = { 0 };
4066     NvmeSmartLog smart = { 0 };
4067     uint32_t trans_len;
4068     NvmeNamespace *ns;
4069     time_t current_ms;
4070 
4071     if (off >= sizeof(smart)) {
4072         return NVME_INVALID_FIELD | NVME_DNR;
4073     }
4074 
4075     if (nsid != 0xffffffff) {
4076         ns = nvme_ns(n, nsid);
4077         if (!ns) {
4078             return NVME_INVALID_NSID | NVME_DNR;
4079         }
4080         nvme_set_blk_stats(ns, &stats);
4081     } else {
4082         int i;
4083 
4084         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4085             ns = nvme_ns(n, i);
4086             if (!ns) {
4087                 continue;
4088             }
4089             nvme_set_blk_stats(ns, &stats);
4090         }
4091     }
4092 
4093     trans_len = MIN(sizeof(smart) - off, buf_len);
4094     smart.critical_warning = n->smart_critical_warning;
4095 
4096     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4097                                                         1000));
4098     smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4099                                                            1000));
4100     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4101     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4102 
4103     smart.temperature = cpu_to_le16(n->temperature);
4104 
4105     if ((n->temperature >= n->features.temp_thresh_hi) ||
4106         (n->temperature <= n->features.temp_thresh_low)) {
4107         smart.critical_warning |= NVME_SMART_TEMPERATURE;
4108     }
4109 
4110     current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4111     smart.power_on_hours[0] =
4112         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4113 
4114     if (!rae) {
4115         nvme_clear_events(n, NVME_AER_TYPE_SMART);
4116     }
4117 
4118     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4119 }
4120 
4121 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4122                                  NvmeRequest *req)
4123 {
4124     uint32_t trans_len;
4125     NvmeFwSlotInfoLog fw_log = {
4126         .afi = 0x1,
4127     };
4128 
4129     if (off >= sizeof(fw_log)) {
4130         return NVME_INVALID_FIELD | NVME_DNR;
4131     }
4132 
4133     strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4134     trans_len = MIN(sizeof(fw_log) - off, buf_len);
4135 
4136     return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4137 }
4138 
4139 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4140                                 uint64_t off, NvmeRequest *req)
4141 {
4142     uint32_t trans_len;
4143     NvmeErrorLog errlog;
4144 
4145     if (off >= sizeof(errlog)) {
4146         return NVME_INVALID_FIELD | NVME_DNR;
4147     }
4148 
4149     if (!rae) {
4150         nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4151     }
4152 
4153     memset(&errlog, 0x0, sizeof(errlog));
4154     trans_len = MIN(sizeof(errlog) - off, buf_len);
4155 
4156     return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4157 }
4158 
4159 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4160                                     uint64_t off, NvmeRequest *req)
4161 {
4162     uint32_t nslist[1024];
4163     uint32_t trans_len;
4164     int i = 0;
4165     uint32_t nsid;
4166 
4167     memset(nslist, 0x0, sizeof(nslist));
4168     trans_len = MIN(sizeof(nslist) - off, buf_len);
4169 
4170     while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4171             NVME_CHANGED_NSID_SIZE) {
4172         /*
4173          * If more than 1024 namespaces, the first entry in the log page should
4174          * be set to FFFFFFFFh and the others to 0 as spec.
4175          */
4176         if (i == ARRAY_SIZE(nslist)) {
4177             memset(nslist, 0x0, sizeof(nslist));
4178             nslist[0] = 0xffffffff;
4179             break;
4180         }
4181 
4182         nslist[i++] = nsid;
4183         clear_bit(nsid, n->changed_nsids);
4184     }
4185 
4186     /*
4187      * Remove all the remaining list entries in case returns directly due to
4188      * more than 1024 namespaces.
4189      */
4190     if (nslist[0] == 0xffffffff) {
4191         bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4192     }
4193 
4194     if (!rae) {
4195         nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4196     }
4197 
4198     return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4199 }
4200 
4201 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4202                                  uint64_t off, NvmeRequest *req)
4203 {
4204     NvmeEffectsLog log = {};
4205     const uint32_t *src_iocs = NULL;
4206     uint32_t trans_len;
4207 
4208     if (off >= sizeof(log)) {
4209         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4210         return NVME_INVALID_FIELD | NVME_DNR;
4211     }
4212 
4213     switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4214     case NVME_CC_CSS_NVM:
4215         src_iocs = nvme_cse_iocs_nvm;
4216         /* fall through */
4217     case NVME_CC_CSS_ADMIN_ONLY:
4218         break;
4219     case NVME_CC_CSS_CSI:
4220         switch (csi) {
4221         case NVME_CSI_NVM:
4222             src_iocs = nvme_cse_iocs_nvm;
4223             break;
4224         case NVME_CSI_ZONED:
4225             src_iocs = nvme_cse_iocs_zoned;
4226             break;
4227         }
4228     }
4229 
4230     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4231 
4232     if (src_iocs) {
4233         memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4234     }
4235 
4236     trans_len = MIN(sizeof(log) - off, buf_len);
4237 
4238     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4239 }
4240 
4241 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4242 {
4243     NvmeCmd *cmd = &req->cmd;
4244 
4245     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4246     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4247     uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4248     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4249     uint8_t  lid = dw10 & 0xff;
4250     uint8_t  lsp = (dw10 >> 8) & 0xf;
4251     uint8_t  rae = (dw10 >> 15) & 0x1;
4252     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
4253     uint32_t numdl, numdu;
4254     uint64_t off, lpol, lpou;
4255     size_t   len;
4256     uint16_t status;
4257 
4258     numdl = (dw10 >> 16);
4259     numdu = (dw11 & 0xffff);
4260     lpol = dw12;
4261     lpou = dw13;
4262 
4263     len = (((numdu << 16) | numdl) + 1) << 2;
4264     off = (lpou << 32ULL) | lpol;
4265 
4266     if (off & 0x3) {
4267         return NVME_INVALID_FIELD | NVME_DNR;
4268     }
4269 
4270     trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4271 
4272     status = nvme_check_mdts(n, len);
4273     if (status) {
4274         return status;
4275     }
4276 
4277     switch (lid) {
4278     case NVME_LOG_ERROR_INFO:
4279         return nvme_error_info(n, rae, len, off, req);
4280     case NVME_LOG_SMART_INFO:
4281         return nvme_smart_info(n, rae, len, off, req);
4282     case NVME_LOG_FW_SLOT_INFO:
4283         return nvme_fw_log_info(n, len, off, req);
4284     case NVME_LOG_CHANGED_NSLIST:
4285         return nvme_changed_nslist(n, rae, len, off, req);
4286     case NVME_LOG_CMD_EFFECTS:
4287         return nvme_cmd_effects(n, csi, len, off, req);
4288     default:
4289         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4290         return NVME_INVALID_FIELD | NVME_DNR;
4291     }
4292 }
4293 
4294 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4295 {
4296     n->cq[cq->cqid] = NULL;
4297     timer_free(cq->timer);
4298     if (msix_enabled(&n->parent_obj)) {
4299         msix_vector_unuse(&n->parent_obj, cq->vector);
4300     }
4301     if (cq->cqid) {
4302         g_free(cq);
4303     }
4304 }
4305 
4306 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4307 {
4308     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4309     NvmeCQueue *cq;
4310     uint16_t qid = le16_to_cpu(c->qid);
4311 
4312     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4313         trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4314         return NVME_INVALID_CQID | NVME_DNR;
4315     }
4316 
4317     cq = n->cq[qid];
4318     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4319         trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4320         return NVME_INVALID_QUEUE_DEL;
4321     }
4322 
4323     if (cq->irq_enabled && cq->tail != cq->head) {
4324         n->cq_pending--;
4325     }
4326 
4327     nvme_irq_deassert(n, cq);
4328     trace_pci_nvme_del_cq(qid);
4329     nvme_free_cq(cq, n);
4330     return NVME_SUCCESS;
4331 }
4332 
4333 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4334                          uint16_t cqid, uint16_t vector, uint16_t size,
4335                          uint16_t irq_enabled)
4336 {
4337     int ret;
4338 
4339     if (msix_enabled(&n->parent_obj)) {
4340         ret = msix_vector_use(&n->parent_obj, vector);
4341         assert(ret == 0);
4342     }
4343     cq->ctrl = n;
4344     cq->cqid = cqid;
4345     cq->size = size;
4346     cq->dma_addr = dma_addr;
4347     cq->phase = 1;
4348     cq->irq_enabled = irq_enabled;
4349     cq->vector = vector;
4350     cq->head = cq->tail = 0;
4351     QTAILQ_INIT(&cq->req_list);
4352     QTAILQ_INIT(&cq->sq_list);
4353     n->cq[cqid] = cq;
4354     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4355 }
4356 
4357 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4358 {
4359     NvmeCQueue *cq;
4360     NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4361     uint16_t cqid = le16_to_cpu(c->cqid);
4362     uint16_t vector = le16_to_cpu(c->irq_vector);
4363     uint16_t qsize = le16_to_cpu(c->qsize);
4364     uint16_t qflags = le16_to_cpu(c->cq_flags);
4365     uint64_t prp1 = le64_to_cpu(c->prp1);
4366 
4367     trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4368                              NVME_CQ_FLAGS_IEN(qflags) != 0);
4369 
4370     if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4371         n->cq[cqid] != NULL)) {
4372         trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4373         return NVME_INVALID_QID | NVME_DNR;
4374     }
4375     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4376         trace_pci_nvme_err_invalid_create_cq_size(qsize);
4377         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4378     }
4379     if (unlikely(prp1 & (n->page_size - 1))) {
4380         trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4381         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4382     }
4383     if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4384         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4385         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4386     }
4387     if (unlikely(vector >= n->params.msix_qsize)) {
4388         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4389         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4390     }
4391     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4392         trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4393         return NVME_INVALID_FIELD | NVME_DNR;
4394     }
4395 
4396     cq = g_malloc0(sizeof(*cq));
4397     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4398                  NVME_CQ_FLAGS_IEN(qflags));
4399 
4400     /*
4401      * It is only required to set qs_created when creating a completion queue;
4402      * creating a submission queue without a matching completion queue will
4403      * fail.
4404      */
4405     n->qs_created = true;
4406     return NVME_SUCCESS;
4407 }
4408 
4409 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4410 {
4411     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4412 
4413     return nvme_c2h(n, id, sizeof(id), req);
4414 }
4415 
4416 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4417 {
4418     trace_pci_nvme_identify_ctrl();
4419 
4420     return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4421 }
4422 
4423 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4424 {
4425     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4426     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4427     NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4428 
4429     trace_pci_nvme_identify_ctrl_csi(c->csi);
4430 
4431     switch (c->csi) {
4432     case NVME_CSI_NVM:
4433         id_nvm->vsl = n->params.vsl;
4434         id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4435         break;
4436 
4437     case NVME_CSI_ZONED:
4438         ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4439         break;
4440 
4441     default:
4442         return NVME_INVALID_FIELD | NVME_DNR;
4443     }
4444 
4445     return nvme_c2h(n, id, sizeof(id), req);
4446 }
4447 
4448 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4449 {
4450     NvmeNamespace *ns;
4451     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4452     uint32_t nsid = le32_to_cpu(c->nsid);
4453 
4454     trace_pci_nvme_identify_ns(nsid);
4455 
4456     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4457         return NVME_INVALID_NSID | NVME_DNR;
4458     }
4459 
4460     ns = nvme_ns(n, nsid);
4461     if (unlikely(!ns)) {
4462         if (!active) {
4463             ns = nvme_subsys_ns(n->subsys, nsid);
4464             if (!ns) {
4465                 return nvme_rpt_empty_id_struct(n, req);
4466             }
4467         } else {
4468             return nvme_rpt_empty_id_struct(n, req);
4469         }
4470     }
4471 
4472     if (active || ns->csi == NVME_CSI_NVM) {
4473         return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4474     }
4475 
4476     return NVME_INVALID_CMD_SET | NVME_DNR;
4477 }
4478 
4479 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4480                                         bool attached)
4481 {
4482     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4483     uint32_t nsid = le32_to_cpu(c->nsid);
4484     uint16_t min_id = le16_to_cpu(c->ctrlid);
4485     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4486     uint16_t *ids = &list[1];
4487     NvmeNamespace *ns;
4488     NvmeCtrl *ctrl;
4489     int cntlid, nr_ids = 0;
4490 
4491     trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4492 
4493     if (!n->subsys) {
4494         return NVME_INVALID_FIELD | NVME_DNR;
4495     }
4496 
4497     if (attached) {
4498         if (nsid == NVME_NSID_BROADCAST) {
4499             return NVME_INVALID_FIELD | NVME_DNR;
4500         }
4501 
4502         ns = nvme_subsys_ns(n->subsys, nsid);
4503         if (!ns) {
4504             return NVME_INVALID_FIELD | NVME_DNR;
4505         }
4506     }
4507 
4508     for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4509         ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4510         if (!ctrl) {
4511             continue;
4512         }
4513 
4514         if (attached && !nvme_ns(ctrl, nsid)) {
4515             continue;
4516         }
4517 
4518         ids[nr_ids++] = cntlid;
4519     }
4520 
4521     list[0] = nr_ids;
4522 
4523     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4524 }
4525 
4526 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4527                                      bool active)
4528 {
4529     NvmeNamespace *ns;
4530     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4531     uint32_t nsid = le32_to_cpu(c->nsid);
4532 
4533     trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4534 
4535     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4536         return NVME_INVALID_NSID | NVME_DNR;
4537     }
4538 
4539     ns = nvme_ns(n, nsid);
4540     if (unlikely(!ns)) {
4541         if (!active) {
4542             ns = nvme_subsys_ns(n->subsys, nsid);
4543             if (!ns) {
4544                 return nvme_rpt_empty_id_struct(n, req);
4545             }
4546         } else {
4547             return nvme_rpt_empty_id_struct(n, req);
4548         }
4549     }
4550 
4551     if (c->csi == NVME_CSI_NVM) {
4552         return nvme_rpt_empty_id_struct(n, req);
4553     } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4554         return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4555                         req);
4556     }
4557 
4558     return NVME_INVALID_FIELD | NVME_DNR;
4559 }
4560 
4561 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4562                                      bool active)
4563 {
4564     NvmeNamespace *ns;
4565     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4566     uint32_t min_nsid = le32_to_cpu(c->nsid);
4567     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4568     static const int data_len = sizeof(list);
4569     uint32_t *list_ptr = (uint32_t *)list;
4570     int i, j = 0;
4571 
4572     trace_pci_nvme_identify_nslist(min_nsid);
4573 
4574     /*
4575      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFFEh are invalid values
4576      * since the Active Namespace ID List should return namespaces with ids
4577      * *higher* than the NSID specified in the command. This is also specified
4578      * in the spec (NVM Express v1.3d, Section 5.15.4).
4579      */
4580     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4581         return NVME_INVALID_NSID | NVME_DNR;
4582     }
4583 
4584     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4585         ns = nvme_ns(n, i);
4586         if (!ns) {
4587             if (!active) {
4588                 ns = nvme_subsys_ns(n->subsys, i);
4589                 if (!ns) {
4590                     continue;
4591                 }
4592             } else {
4593                 continue;
4594             }
4595         }
4596         if (ns->params.nsid <= min_nsid) {
4597             continue;
4598         }
4599         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4600         if (j == data_len / sizeof(uint32_t)) {
4601             break;
4602         }
4603     }
4604 
4605     return nvme_c2h(n, list, data_len, req);
4606 }
4607 
4608 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4609                                          bool active)
4610 {
4611     NvmeNamespace *ns;
4612     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4613     uint32_t min_nsid = le32_to_cpu(c->nsid);
4614     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4615     static const int data_len = sizeof(list);
4616     uint32_t *list_ptr = (uint32_t *)list;
4617     int i, j = 0;
4618 
4619     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4620 
4621     /*
4622      * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFFEh are invalid.
4623      */
4624     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4625         return NVME_INVALID_NSID | NVME_DNR;
4626     }
4627 
4628     if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4629         return NVME_INVALID_FIELD | NVME_DNR;
4630     }
4631 
4632     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4633         ns = nvme_ns(n, i);
4634         if (!ns) {
4635             if (!active) {
4636                 ns = nvme_subsys_ns(n->subsys, i);
4637                 if (!ns) {
4638                     continue;
4639                 }
4640             } else {
4641                 continue;
4642             }
4643         }
4644         if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4645             continue;
4646         }
4647         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4648         if (j == data_len / sizeof(uint32_t)) {
4649             break;
4650         }
4651     }
4652 
4653     return nvme_c2h(n, list, data_len, req);
4654 }
4655 
4656 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4657 {
4658     NvmeNamespace *ns;
4659     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4660     uint32_t nsid = le32_to_cpu(c->nsid);
4661     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4662     uint8_t *pos = list;
4663     struct {
4664         NvmeIdNsDescr hdr;
4665         uint8_t v[NVME_NIDL_UUID];
4666     } QEMU_PACKED uuid = {};
4667     struct {
4668         NvmeIdNsDescr hdr;
4669         uint64_t v;
4670     } QEMU_PACKED eui64 = {};
4671     struct {
4672         NvmeIdNsDescr hdr;
4673         uint8_t v;
4674     } QEMU_PACKED csi = {};
4675 
4676     trace_pci_nvme_identify_ns_descr_list(nsid);
4677 
4678     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4679         return NVME_INVALID_NSID | NVME_DNR;
4680     }
4681 
4682     ns = nvme_ns(n, nsid);
4683     if (unlikely(!ns)) {
4684         return NVME_INVALID_FIELD | NVME_DNR;
4685     }
4686 
4687     /*
4688      * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4689      * provide a valid Namespace UUID in the Namespace Identification Descriptor
4690      * data structure. QEMU does not yet support setting NGUID.
4691      */
4692     uuid.hdr.nidt = NVME_NIDT_UUID;
4693     uuid.hdr.nidl = NVME_NIDL_UUID;
4694     memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4695     memcpy(pos, &uuid, sizeof(uuid));
4696     pos += sizeof(uuid);
4697 
4698     if (ns->params.eui64) {
4699         eui64.hdr.nidt = NVME_NIDT_EUI64;
4700         eui64.hdr.nidl = NVME_NIDL_EUI64;
4701         eui64.v = cpu_to_be64(ns->params.eui64);
4702         memcpy(pos, &eui64, sizeof(eui64));
4703         pos += sizeof(eui64);
4704     }
4705 
4706     csi.hdr.nidt = NVME_NIDT_CSI;
4707     csi.hdr.nidl = NVME_NIDL_CSI;
4708     csi.v = ns->csi;
4709     memcpy(pos, &csi, sizeof(csi));
4710     pos += sizeof(csi);
4711 
4712     return nvme_c2h(n, list, sizeof(list), req);
4713 }
4714 
4715 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4716 {
4717     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4718     static const int data_len = sizeof(list);
4719 
4720     trace_pci_nvme_identify_cmd_set();
4721 
4722     NVME_SET_CSI(*list, NVME_CSI_NVM);
4723     NVME_SET_CSI(*list, NVME_CSI_ZONED);
4724 
4725     return nvme_c2h(n, list, data_len, req);
4726 }
4727 
4728 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4729 {
4730     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4731 
4732     trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4733                             c->csi);
4734 
4735     switch (c->cns) {
4736     case NVME_ID_CNS_NS:
4737         return nvme_identify_ns(n, req, true);
4738     case NVME_ID_CNS_NS_PRESENT:
4739         return nvme_identify_ns(n, req, false);
4740     case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4741         return nvme_identify_ctrl_list(n, req, true);
4742     case NVME_ID_CNS_CTRL_LIST:
4743         return nvme_identify_ctrl_list(n, req, false);
4744     case NVME_ID_CNS_CS_NS:
4745         return nvme_identify_ns_csi(n, req, true);
4746     case NVME_ID_CNS_CS_NS_PRESENT:
4747         return nvme_identify_ns_csi(n, req, false);
4748     case NVME_ID_CNS_CTRL:
4749         return nvme_identify_ctrl(n, req);
4750     case NVME_ID_CNS_CS_CTRL:
4751         return nvme_identify_ctrl_csi(n, req);
4752     case NVME_ID_CNS_NS_ACTIVE_LIST:
4753         return nvme_identify_nslist(n, req, true);
4754     case NVME_ID_CNS_NS_PRESENT_LIST:
4755         return nvme_identify_nslist(n, req, false);
4756     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4757         return nvme_identify_nslist_csi(n, req, true);
4758     case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4759         return nvme_identify_nslist_csi(n, req, false);
4760     case NVME_ID_CNS_NS_DESCR_LIST:
4761         return nvme_identify_ns_descr_list(n, req);
4762     case NVME_ID_CNS_IO_COMMAND_SET:
4763         return nvme_identify_cmd_set(n, req);
4764     default:
4765         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4766         return NVME_INVALID_FIELD | NVME_DNR;
4767     }
4768 }
4769 
4770 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4771 {
4772     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4773 
4774     req->cqe.result = 1;
4775     if (nvme_check_sqid(n, sqid)) {
4776         return NVME_INVALID_FIELD | NVME_DNR;
4777     }
4778 
4779     return NVME_SUCCESS;
4780 }
4781 
4782 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4783 {
4784     trace_pci_nvme_setfeat_timestamp(ts);
4785 
4786     n->host_timestamp = le64_to_cpu(ts);
4787     n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4788 }
4789 
4790 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4791 {
4792     uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4793     uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4794 
4795     union nvme_timestamp {
4796         struct {
4797             uint64_t timestamp:48;
4798             uint64_t sync:1;
4799             uint64_t origin:3;
4800             uint64_t rsvd1:12;
4801         };
4802         uint64_t all;
4803     };
4804 
4805     union nvme_timestamp ts;
4806     ts.all = 0;
4807     ts.timestamp = n->host_timestamp + elapsed_time;
4808 
4809     /* If the host timestamp is non-zero, set the timestamp origin */
4810     ts.origin = n->host_timestamp ? 0x01 : 0x00;
4811 
4812     trace_pci_nvme_getfeat_timestamp(ts.all);
4813 
4814     return cpu_to_le64(ts.all);
4815 }
4816 
4817 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4818 {
4819     uint64_t timestamp = nvme_get_timestamp(n);
4820 
4821     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4822 }
4823 
4824 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4825 {
4826     NvmeCmd *cmd = &req->cmd;
4827     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4828     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4829     uint32_t nsid = le32_to_cpu(cmd->nsid);
4830     uint32_t result;
4831     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4832     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4833     uint16_t iv;
4834     NvmeNamespace *ns;
4835     int i;
4836 
4837     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4838         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4839     };
4840 
4841     trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4842 
4843     if (!nvme_feature_support[fid]) {
4844         return NVME_INVALID_FIELD | NVME_DNR;
4845     }
4846 
4847     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4848         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4849             /*
4850              * The Reservation Notification Mask and Reservation Persistence
4851              * features require a status code of Invalid Field in Command when
4852              * NSID is FFFFFFFFh. Since the device does not support those
4853              * features we can always return Invalid Namespace or Format as we
4854              * should do for all other features.
4855              */
4856             return NVME_INVALID_NSID | NVME_DNR;
4857         }
4858 
4859         if (!nvme_ns(n, nsid)) {
4860             return NVME_INVALID_FIELD | NVME_DNR;
4861         }
4862     }
4863 
4864     switch (sel) {
4865     case NVME_GETFEAT_SELECT_CURRENT:
4866         break;
4867     case NVME_GETFEAT_SELECT_SAVED:
4868         /* no features are saveable by the controller; fallthrough */
4869     case NVME_GETFEAT_SELECT_DEFAULT:
4870         goto defaults;
4871     case NVME_GETFEAT_SELECT_CAP:
4872         result = nvme_feature_cap[fid];
4873         goto out;
4874     }
4875 
4876     switch (fid) {
4877     case NVME_TEMPERATURE_THRESHOLD:
4878         result = 0;
4879 
4880         /*
4881          * The controller only implements the Composite Temperature sensor, so
4882          * return 0 for all other sensors.
4883          */
4884         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4885             goto out;
4886         }
4887 
4888         switch (NVME_TEMP_THSEL(dw11)) {
4889         case NVME_TEMP_THSEL_OVER:
4890             result = n->features.temp_thresh_hi;
4891             goto out;
4892         case NVME_TEMP_THSEL_UNDER:
4893             result = n->features.temp_thresh_low;
4894             goto out;
4895         }
4896 
4897         return NVME_INVALID_FIELD | NVME_DNR;
4898     case NVME_ERROR_RECOVERY:
4899         if (!nvme_nsid_valid(n, nsid)) {
4900             return NVME_INVALID_NSID | NVME_DNR;
4901         }
4902 
4903         ns = nvme_ns(n, nsid);
4904         if (unlikely(!ns)) {
4905             return NVME_INVALID_FIELD | NVME_DNR;
4906         }
4907 
4908         result = ns->features.err_rec;
4909         goto out;
4910     case NVME_VOLATILE_WRITE_CACHE:
4911         result = 0;
4912         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4913             ns = nvme_ns(n, i);
4914             if (!ns) {
4915                 continue;
4916             }
4917 
4918             result = blk_enable_write_cache(ns->blkconf.blk);
4919             if (result) {
4920                 break;
4921             }
4922         }
4923         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4924         goto out;
4925     case NVME_ASYNCHRONOUS_EVENT_CONF:
4926         result = n->features.async_config;
4927         goto out;
4928     case NVME_TIMESTAMP:
4929         return nvme_get_feature_timestamp(n, req);
4930     default:
4931         break;
4932     }
4933 
4934 defaults:
4935     switch (fid) {
4936     case NVME_TEMPERATURE_THRESHOLD:
4937         result = 0;
4938 
4939         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4940             break;
4941         }
4942 
4943         if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4944             result = NVME_TEMPERATURE_WARNING;
4945         }
4946 
4947         break;
4948     case NVME_NUMBER_OF_QUEUES:
4949         result = (n->params.max_ioqpairs - 1) |
4950             ((n->params.max_ioqpairs - 1) << 16);
4951         trace_pci_nvme_getfeat_numq(result);
4952         break;
4953     case NVME_INTERRUPT_VECTOR_CONF:
4954         iv = dw11 & 0xffff;
4955         if (iv >= n->params.max_ioqpairs + 1) {
4956             return NVME_INVALID_FIELD | NVME_DNR;
4957         }
4958 
4959         result = iv;
4960         if (iv == n->admin_cq.vector) {
4961             result |= NVME_INTVC_NOCOALESCING;
4962         }
4963         break;
4964     default:
4965         result = nvme_feature_default[fid];
4966         break;
4967     }
4968 
4969 out:
4970     req->cqe.result = cpu_to_le32(result);
4971     return NVME_SUCCESS;
4972 }
4973 
4974 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4975 {
4976     uint16_t ret;
4977     uint64_t timestamp;
4978 
4979     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4980     if (ret) {
4981         return ret;
4982     }
4983 
4984     nvme_set_timestamp(n, timestamp);
4985 
4986     return NVME_SUCCESS;
4987 }
4988 
4989 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4990 {
4991     NvmeNamespace *ns = NULL;
4992 
4993     NvmeCmd *cmd = &req->cmd;
4994     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4995     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4996     uint32_t nsid = le32_to_cpu(cmd->nsid);
4997     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4998     uint8_t save = NVME_SETFEAT_SAVE(dw10);
4999     int i;
5000 
5001     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5002 
5003     if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5004         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5005     }
5006 
5007     if (!nvme_feature_support[fid]) {
5008         return NVME_INVALID_FIELD | NVME_DNR;
5009     }
5010 
5011     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5012         if (nsid != NVME_NSID_BROADCAST) {
5013             if (!nvme_nsid_valid(n, nsid)) {
5014                 return NVME_INVALID_NSID | NVME_DNR;
5015             }
5016 
5017             ns = nvme_ns(n, nsid);
5018             if (unlikely(!ns)) {
5019                 return NVME_INVALID_FIELD | NVME_DNR;
5020             }
5021         }
5022     } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5023         if (!nvme_nsid_valid(n, nsid)) {
5024             return NVME_INVALID_NSID | NVME_DNR;
5025         }
5026 
5027         return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5028     }
5029 
5030     if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5031         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5032     }
5033 
5034     switch (fid) {
5035     case NVME_TEMPERATURE_THRESHOLD:
5036         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5037             break;
5038         }
5039 
5040         switch (NVME_TEMP_THSEL(dw11)) {
5041         case NVME_TEMP_THSEL_OVER:
5042             n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5043             break;
5044         case NVME_TEMP_THSEL_UNDER:
5045             n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5046             break;
5047         default:
5048             return NVME_INVALID_FIELD | NVME_DNR;
5049         }
5050 
5051         if ((n->temperature >= n->features.temp_thresh_hi) ||
5052             (n->temperature <= n->features.temp_thresh_low)) {
5053             nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5054         }
5055 
5056         break;
5057     case NVME_ERROR_RECOVERY:
5058         if (nsid == NVME_NSID_BROADCAST) {
5059             for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5060                 ns = nvme_ns(n, i);
5061 
5062                 if (!ns) {
5063                     continue;
5064                 }
5065 
5066                 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5067                     ns->features.err_rec = dw11;
5068                 }
5069             }
5070 
5071             break;
5072         }
5073 
5074         assert(ns);
5075         if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
5076             ns->features.err_rec = dw11;
5077         }
5078         break;
5079     case NVME_VOLATILE_WRITE_CACHE:
5080         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5081             ns = nvme_ns(n, i);
5082             if (!ns) {
5083                 continue;
5084             }
5085 
5086             if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5087                 blk_flush(ns->blkconf.blk);
5088             }
5089 
5090             blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5091         }
5092 
5093         break;
5094 
5095     case NVME_NUMBER_OF_QUEUES:
5096         if (n->qs_created) {
5097             return NVME_CMD_SEQ_ERROR | NVME_DNR;
5098         }
5099 
5100         /*
5101          * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
5102          * and NSQR.
5103          */
5104         if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5105             return NVME_INVALID_FIELD | NVME_DNR;
5106         }
5107 
5108         trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5109                                     ((dw11 >> 16) & 0xffff) + 1,
5110                                     n->params.max_ioqpairs,
5111                                     n->params.max_ioqpairs);
5112         req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5113                                       ((n->params.max_ioqpairs - 1) << 16));
5114         break;
5115     case NVME_ASYNCHRONOUS_EVENT_CONF:
5116         n->features.async_config = dw11;
5117         break;
5118     case NVME_TIMESTAMP:
5119         return nvme_set_feature_timestamp(n, req);
5120     case NVME_COMMAND_SET_PROFILE:
5121         if (dw11 & 0x1ff) {
5122             trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5123             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5124         }
5125         break;
5126     default:
5127         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5128     }
5129     return NVME_SUCCESS;
5130 }
5131 
5132 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5133 {
5134     trace_pci_nvme_aer(nvme_cid(req));
5135 
5136     if (n->outstanding_aers > n->params.aerl) {
5137         trace_pci_nvme_aer_aerl_exceeded();
5138         return NVME_AER_LIMIT_EXCEEDED;
5139     }
5140 
5141     n->aer_reqs[n->outstanding_aers] = req;
5142     n->outstanding_aers++;
5143 
5144     if (!QTAILQ_EMPTY(&n->aer_queue)) {
5145         nvme_process_aers(n);
5146     }
5147 
5148     return NVME_NO_COMPLETE;
5149 }
5150 
5151 static void nvme_update_dmrsl(NvmeCtrl *n)
5152 {
5153     int nsid;
5154 
5155     for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5156         NvmeNamespace *ns = nvme_ns(n, nsid);
5157         if (!ns) {
5158             continue;
5159         }
5160 
5161         n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5162                                 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5163     }
5164 }
5165 
5166 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5167 {
5168     uint32_t cc = ldl_le_p(&n->bar.cc);
5169 
5170     ns->iocs = nvme_cse_iocs_none;
5171     switch (ns->csi) {
5172     case NVME_CSI_NVM:
5173         if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5174             ns->iocs = nvme_cse_iocs_nvm;
5175         }
5176         break;
5177     case NVME_CSI_ZONED:
5178         if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5179             ns->iocs = nvme_cse_iocs_zoned;
5180         } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5181             ns->iocs = nvme_cse_iocs_nvm;
5182         }
5183         break;
5184     }
5185 }
5186 
5187 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5188 {
5189     NvmeNamespace *ns;
5190     NvmeCtrl *ctrl;
5191     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5192     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5193     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5194     bool attach = !(dw10 & 0xf);
5195     uint16_t *nr_ids = &list[0];
5196     uint16_t *ids = &list[1];
5197     uint16_t ret;
5198     int i;
5199 
5200     trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5201 
5202     if (!nvme_nsid_valid(n, nsid)) {
5203         return NVME_INVALID_NSID | NVME_DNR;
5204     }
5205 
5206     ns = nvme_subsys_ns(n->subsys, nsid);
5207     if (!ns) {
5208         return NVME_INVALID_FIELD | NVME_DNR;
5209     }
5210 
5211     ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5212     if (ret) {
5213         return ret;
5214     }
5215 
5216     if (!*nr_ids) {
5217         return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5218     }
5219 
5220     *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5221     for (i = 0; i < *nr_ids; i++) {
5222         ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5223         if (!ctrl) {
5224             return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5225         }
5226 
5227         if (attach) {
5228             if (nvme_ns(ctrl, nsid)) {
5229                 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5230             }
5231 
5232             if (ns->attached && !ns->params.shared) {
5233                 return NVME_NS_PRIVATE | NVME_DNR;
5234             }
5235 
5236             nvme_attach_ns(ctrl, ns);
5237             nvme_select_iocs_ns(ctrl, ns);
5238         } else {
5239             if (!nvme_ns(ctrl, nsid)) {
5240                 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5241             }
5242 
5243             ctrl->namespaces[nsid] = NULL;
5244             ns->attached--;
5245 
5246             nvme_update_dmrsl(ctrl);
5247         }
5248 
5249         /*
5250          * Add namespace id to the changed namespace id list for event clearing
5251          * via Get Log Page command.
5252          */
5253         if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5254             nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5255                                NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5256                                NVME_LOG_CHANGED_NSLIST);
5257         }
5258     }
5259 
5260     return NVME_SUCCESS;
5261 }
5262 
5263 typedef struct NvmeFormatAIOCB {
5264     BlockAIOCB common;
5265     BlockAIOCB *aiocb;
5266     QEMUBH *bh;
5267     NvmeRequest *req;
5268     int ret;
5269 
5270     NvmeNamespace *ns;
5271     uint32_t nsid;
5272     bool broadcast;
5273     int64_t offset;
5274 } NvmeFormatAIOCB;
5275 
5276 static void nvme_format_bh(void *opaque);
5277 
5278 static void nvme_format_cancel(BlockAIOCB *aiocb)
5279 {
5280     NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5281 
5282     if (iocb->aiocb) {
5283         blk_aio_cancel_async(iocb->aiocb);
5284     }
5285 }
5286 
5287 static const AIOCBInfo nvme_format_aiocb_info = {
5288     .aiocb_size = sizeof(NvmeFormatAIOCB),
5289     .cancel_async = nvme_format_cancel,
5290     .get_aio_context = nvme_get_aio_context,
5291 };
5292 
5293 static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5294 {
5295     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5296     uint8_t lbaf = dw10 & 0xf;
5297     uint8_t pi = (dw10 >> 5) & 0x7;
5298     uint8_t mset = (dw10 >> 4) & 0x1;
5299     uint8_t pil = (dw10 >> 8) & 0x1;
5300 
5301     trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5302 
5303     ns->id_ns.dps = (pil << 3) | pi;
5304     ns->id_ns.flbas = lbaf | (mset << 4);
5305 
5306     nvme_ns_init_format(ns);
5307 }
5308 
5309 static void nvme_format_ns_cb(void *opaque, int ret)
5310 {
5311     NvmeFormatAIOCB *iocb = opaque;
5312     NvmeRequest *req = iocb->req;
5313     NvmeNamespace *ns = iocb->ns;
5314     int bytes;
5315 
5316     if (ret < 0) {
5317         iocb->ret = ret;
5318         goto done;
5319     }
5320 
5321     assert(ns);
5322 
5323     if (iocb->offset < ns->size) {
5324         bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5325 
5326         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5327                                             bytes, BDRV_REQ_MAY_UNMAP,
5328                                             nvme_format_ns_cb, iocb);
5329 
5330         iocb->offset += bytes;
5331         return;
5332     }
5333 
5334     nvme_format_set(ns, &req->cmd);
5335     ns->status = 0x0;
5336     iocb->ns = NULL;
5337     iocb->offset = 0;
5338 
5339 done:
5340     iocb->aiocb = NULL;
5341     qemu_bh_schedule(iocb->bh);
5342 }
5343 
5344 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5345 {
5346     if (ns->params.zoned) {
5347         return NVME_INVALID_FORMAT | NVME_DNR;
5348     }
5349 
5350     if (lbaf > ns->id_ns.nlbaf) {
5351         return NVME_INVALID_FORMAT | NVME_DNR;
5352     }
5353 
5354     if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5355         return NVME_INVALID_FORMAT | NVME_DNR;
5356     }
5357 
5358     if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5359         return NVME_INVALID_FIELD | NVME_DNR;
5360     }
5361 
5362     return NVME_SUCCESS;
5363 }
5364 
5365 static void nvme_format_bh(void *opaque)
5366 {
5367     NvmeFormatAIOCB *iocb = opaque;
5368     NvmeRequest *req = iocb->req;
5369     NvmeCtrl *n = nvme_ctrl(req);
5370     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5371     uint8_t lbaf = dw10 & 0xf;
5372     uint8_t pi = (dw10 >> 5) & 0x7;
5373     uint16_t status;
5374     int i;
5375 
5376     if (iocb->ret < 0) {
5377         goto done;
5378     }
5379 
5380     if (iocb->broadcast) {
5381         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5382             iocb->ns = nvme_ns(n, i);
5383             if (iocb->ns) {
5384                 iocb->nsid = i;
5385                 break;
5386             }
5387         }
5388     }
5389 
5390     if (!iocb->ns) {
5391         goto done;
5392     }
5393 
5394     status = nvme_format_check(iocb->ns, lbaf, pi);
5395     if (status) {
5396         req->status = status;
5397         goto done;
5398     }
5399 
5400     iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5401     nvme_format_ns_cb(iocb, 0);
5402     return;
5403 
5404 done:
5405     qemu_bh_delete(iocb->bh);
5406     iocb->bh = NULL;
5407 
5408     iocb->common.cb(iocb->common.opaque, iocb->ret);
5409 
5410     qemu_aio_unref(iocb);
5411 }
5412 
5413 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5414 {
5415     NvmeFormatAIOCB *iocb;
5416     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5417     uint16_t status;
5418 
5419     iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5420 
5421     iocb->req = req;
5422     iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5423     iocb->ret = 0;
5424     iocb->ns = NULL;
5425     iocb->nsid = 0;
5426     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5427     iocb->offset = 0;
5428 
5429     if (!iocb->broadcast) {
5430         if (!nvme_nsid_valid(n, nsid)) {
5431             status = NVME_INVALID_NSID | NVME_DNR;
5432             goto out;
5433         }
5434 
5435         iocb->ns = nvme_ns(n, nsid);
5436         if (!iocb->ns) {
5437             status = NVME_INVALID_FIELD | NVME_DNR;
5438             goto out;
5439         }
5440     }
5441 
5442     req->aiocb = &iocb->common;
5443     qemu_bh_schedule(iocb->bh);
5444 
5445     return NVME_NO_COMPLETE;
5446 
5447 out:
5448     qemu_bh_delete(iocb->bh);
5449     iocb->bh = NULL;
5450     qemu_aio_unref(iocb);
5451     return status;
5452 }
5453 
5454 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5455 {
5456     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5457                              nvme_adm_opc_str(req->cmd.opcode));
5458 
5459     if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5460         trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5461         return NVME_INVALID_OPCODE | NVME_DNR;
5462     }
5463 
5464     /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5465     if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5466         return NVME_INVALID_FIELD | NVME_DNR;
5467     }
5468 
5469     switch (req->cmd.opcode) {
5470     case NVME_ADM_CMD_DELETE_SQ:
5471         return nvme_del_sq(n, req);
5472     case NVME_ADM_CMD_CREATE_SQ:
5473         return nvme_create_sq(n, req);
5474     case NVME_ADM_CMD_GET_LOG_PAGE:
5475         return nvme_get_log(n, req);
5476     case NVME_ADM_CMD_DELETE_CQ:
5477         return nvme_del_cq(n, req);
5478     case NVME_ADM_CMD_CREATE_CQ:
5479         return nvme_create_cq(n, req);
5480     case NVME_ADM_CMD_IDENTIFY:
5481         return nvme_identify(n, req);
5482     case NVME_ADM_CMD_ABORT:
5483         return nvme_abort(n, req);
5484     case NVME_ADM_CMD_SET_FEATURES:
5485         return nvme_set_feature(n, req);
5486     case NVME_ADM_CMD_GET_FEATURES:
5487         return nvme_get_feature(n, req);
5488     case NVME_ADM_CMD_ASYNC_EV_REQ:
5489         return nvme_aer(n, req);
5490     case NVME_ADM_CMD_NS_ATTACHMENT:
5491         return nvme_ns_attachment(n, req);
5492     case NVME_ADM_CMD_FORMAT_NVM:
5493         return nvme_format(n, req);
5494     default:
5495         assert(false);
5496     }
5497 
5498     return NVME_INVALID_OPCODE | NVME_DNR;
5499 }
5500 
5501 static void nvme_process_sq(void *opaque)
5502 {
5503     NvmeSQueue *sq = opaque;
5504     NvmeCtrl *n = sq->ctrl;
5505     NvmeCQueue *cq = n->cq[sq->cqid];
5506 
5507     uint16_t status;
5508     hwaddr addr;
5509     NvmeCmd cmd;
5510     NvmeRequest *req;
5511 
5512     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5513         addr = sq->dma_addr + sq->head * n->sqe_size;
5514         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5515             trace_pci_nvme_err_addr_read(addr);
5516             trace_pci_nvme_err_cfs();
5517             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5518             break;
5519         }
5520         nvme_inc_sq_head(sq);
5521 
5522         req = QTAILQ_FIRST(&sq->req_list);
5523         QTAILQ_REMOVE(&sq->req_list, req, entry);
5524         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5525         nvme_req_clear(req);
5526         req->cqe.cid = cmd.cid;
5527         memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5528 
5529         status = sq->sqid ? nvme_io_cmd(n, req) :
5530             nvme_admin_cmd(n, req);
5531         if (status != NVME_NO_COMPLETE) {
5532             req->status = status;
5533             nvme_enqueue_req_completion(cq, req);
5534         }
5535     }
5536 }
5537 
5538 static void nvme_ctrl_reset(NvmeCtrl *n)
5539 {
5540     NvmeNamespace *ns;
5541     int i;
5542 
5543     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5544         ns = nvme_ns(n, i);
5545         if (!ns) {
5546             continue;
5547         }
5548 
5549         nvme_ns_drain(ns);
5550     }
5551 
5552     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5553         if (n->sq[i] != NULL) {
5554             nvme_free_sq(n->sq[i], n);
5555         }
5556     }
5557     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5558         if (n->cq[i] != NULL) {
5559             nvme_free_cq(n->cq[i], n);
5560         }
5561     }
5562 
5563     while (!QTAILQ_EMPTY(&n->aer_queue)) {
5564         NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5565         QTAILQ_REMOVE(&n->aer_queue, event, entry);
5566         g_free(event);
5567     }
5568 
5569     n->aer_queued = 0;
5570     n->outstanding_aers = 0;
5571     n->qs_created = false;
5572 }
5573 
5574 static void nvme_ctrl_shutdown(NvmeCtrl *n)
5575 {
5576     NvmeNamespace *ns;
5577     int i;
5578 
5579     if (n->pmr.dev) {
5580         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5581     }
5582 
5583     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5584         ns = nvme_ns(n, i);
5585         if (!ns) {
5586             continue;
5587         }
5588 
5589         nvme_ns_shutdown(ns);
5590     }
5591 }
5592 
5593 static void nvme_select_iocs(NvmeCtrl *n)
5594 {
5595     NvmeNamespace *ns;
5596     int i;
5597 
5598     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5599         ns = nvme_ns(n, i);
5600         if (!ns) {
5601             continue;
5602         }
5603 
5604         nvme_select_iocs_ns(n, ns);
5605     }
5606 }
5607 
5608 static int nvme_start_ctrl(NvmeCtrl *n)
5609 {
5610     uint64_t cap = ldq_le_p(&n->bar.cap);
5611     uint32_t cc = ldl_le_p(&n->bar.cc);
5612     uint32_t aqa = ldl_le_p(&n->bar.aqa);
5613     uint64_t asq = ldq_le_p(&n->bar.asq);
5614     uint64_t acq = ldq_le_p(&n->bar.acq);
5615     uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5616     uint32_t page_size = 1 << page_bits;
5617 
5618     if (unlikely(n->cq[0])) {
5619         trace_pci_nvme_err_startfail_cq();
5620         return -1;
5621     }
5622     if (unlikely(n->sq[0])) {
5623         trace_pci_nvme_err_startfail_sq();
5624         return -1;
5625     }
5626     if (unlikely(!asq)) {
5627         trace_pci_nvme_err_startfail_nbarasq();
5628         return -1;
5629     }
5630     if (unlikely(!acq)) {
5631         trace_pci_nvme_err_startfail_nbaracq();
5632         return -1;
5633     }
5634     if (unlikely(asq & (page_size - 1))) {
5635         trace_pci_nvme_err_startfail_asq_misaligned(asq);
5636         return -1;
5637     }
5638     if (unlikely(acq & (page_size - 1))) {
5639         trace_pci_nvme_err_startfail_acq_misaligned(acq);
5640         return -1;
5641     }
5642     if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5643         trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5644         return -1;
5645     }
5646     if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5647         trace_pci_nvme_err_startfail_page_too_small(
5648                     NVME_CC_MPS(cc),
5649                     NVME_CAP_MPSMIN(cap));
5650         return -1;
5651     }
5652     if (unlikely(NVME_CC_MPS(cc) >
5653                  NVME_CAP_MPSMAX(cap))) {
5654         trace_pci_nvme_err_startfail_page_too_large(
5655                     NVME_CC_MPS(cc),
5656                     NVME_CAP_MPSMAX(cap));
5657         return -1;
5658     }
5659     if (unlikely(NVME_CC_IOCQES(cc) <
5660                  NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5661         trace_pci_nvme_err_startfail_cqent_too_small(
5662                     NVME_CC_IOCQES(cc),
5663                     NVME_CTRL_CQES_MIN(cap));
5664         return -1;
5665     }
5666     if (unlikely(NVME_CC_IOCQES(cc) >
5667                  NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5668         trace_pci_nvme_err_startfail_cqent_too_large(
5669                     NVME_CC_IOCQES(cc),
5670                     NVME_CTRL_CQES_MAX(cap));
5671         return -1;
5672     }
5673     if (unlikely(NVME_CC_IOSQES(cc) <
5674                  NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5675         trace_pci_nvme_err_startfail_sqent_too_small(
5676                     NVME_CC_IOSQES(cc),
5677                     NVME_CTRL_SQES_MIN(cap));
5678         return -1;
5679     }
5680     if (unlikely(NVME_CC_IOSQES(cc) >
5681                  NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5682         trace_pci_nvme_err_startfail_sqent_too_large(
5683                     NVME_CC_IOSQES(cc),
5684                     NVME_CTRL_SQES_MAX(cap));
5685         return -1;
5686     }
5687     if (unlikely(!NVME_AQA_ASQS(aqa))) {
5688         trace_pci_nvme_err_startfail_asqent_sz_zero();
5689         return -1;
5690     }
5691     if (unlikely(!NVME_AQA_ACQS(aqa))) {
5692         trace_pci_nvme_err_startfail_acqent_sz_zero();
5693         return -1;
5694     }
5695 
5696     n->page_bits = page_bits;
5697     n->page_size = page_size;
5698     n->max_prp_ents = n->page_size / sizeof(uint64_t);
5699     n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5700     n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5701     nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5702     nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5703 
5704     nvme_set_timestamp(n, 0ULL);
5705 
5706     QTAILQ_INIT(&n->aer_queue);
5707 
5708     nvme_select_iocs(n);
5709 
5710     return 0;
5711 }
5712 
5713 static void nvme_cmb_enable_regs(NvmeCtrl *n)
5714 {
5715     uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5716     uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5717 
5718     NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5719     NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5720     NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5721     stl_le_p(&n->bar.cmbloc, cmbloc);
5722 
5723     NVME_CMBSZ_SET_SQS(cmbsz, 1);
5724     NVME_CMBSZ_SET_CQS(cmbsz, 0);
5725     NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5726     NVME_CMBSZ_SET_RDS(cmbsz, 1);
5727     NVME_CMBSZ_SET_WDS(cmbsz, 1);
5728     NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
5729     NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5730     stl_le_p(&n->bar.cmbsz, cmbsz);
5731 }
5732 
5733 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5734                            unsigned size)
5735 {
5736     uint64_t cap = ldq_le_p(&n->bar.cap);
5737     uint32_t cc = ldl_le_p(&n->bar.cc);
5738     uint32_t intms = ldl_le_p(&n->bar.intms);
5739     uint32_t csts = ldl_le_p(&n->bar.csts);
5740     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5741 
5742     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5743         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5744                        "MMIO write not 32-bit aligned,"
5745                        " offset=0x%"PRIx64"", offset);
5746         /* should be ignored, fall through for now */
5747     }
5748 
5749     if (unlikely(size < sizeof(uint32_t))) {
5750         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5751                        "MMIO write smaller than 32-bits,"
5752                        " offset=0x%"PRIx64", size=%u",
5753                        offset, size);
5754         /* should be ignored, fall through for now */
5755     }
5756 
5757     switch (offset) {
5758     case NVME_REG_INTMS:
5759         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5760             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5761                            "undefined access to interrupt mask set"
5762                            " when MSI-X is enabled");
5763             /* should be ignored, fall through for now */
5764         }
5765         intms |= data;
5766         stl_le_p(&n->bar.intms, intms);
5767         n->bar.intmc = n->bar.intms;
5768         trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5769         nvme_irq_check(n);
5770         break;
5771     case NVME_REG_INTMC:
5772         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5773             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5774                            "undefined access to interrupt mask clr"
5775                            " when MSI-X is enabled");
5776             /* should be ignored, fall through for now */
5777         }
5778         intms &= ~data;
5779         stl_le_p(&n->bar.intms, intms);
5780         n->bar.intmc = n->bar.intms;
5781         trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5782         nvme_irq_check(n);
5783         break;
5784     case NVME_REG_CC:
5785         trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5786 
5787         /* Windows first sends data, then sends enable bit */
5788         if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
5789             !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
5790         {
5791             cc = data;
5792         }
5793 
5794         if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5795             cc = data;
5796 
5797             /* flush CC since nvme_start_ctrl() needs the value */
5798             stl_le_p(&n->bar.cc, cc);
5799             if (unlikely(nvme_start_ctrl(n))) {
5800                 trace_pci_nvme_err_startfail();
5801                 csts = NVME_CSTS_FAILED;
5802             } else {
5803                 trace_pci_nvme_mmio_start_success();
5804                 csts = NVME_CSTS_READY;
5805             }
5806         } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5807             trace_pci_nvme_mmio_stopped();
5808             nvme_ctrl_reset(n);
5809             cc = 0;
5810             csts &= ~NVME_CSTS_READY;
5811         }
5812 
5813         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5814             trace_pci_nvme_mmio_shutdown_set();
5815             nvme_ctrl_shutdown(n);
5816             cc = data;
5817             csts |= NVME_CSTS_SHST_COMPLETE;
5818         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5819             trace_pci_nvme_mmio_shutdown_cleared();
5820             csts &= ~NVME_CSTS_SHST_COMPLETE;
5821             cc = data;
5822         }
5823 
5824         stl_le_p(&n->bar.cc, cc);
5825         stl_le_p(&n->bar.csts, csts);
5826 
5827         break;
5828     case NVME_REG_CSTS:
5829         if (data & (1 << 4)) {
5830             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5831                            "attempted to W1C CSTS.NSSRO"
5832                            " but CAP.NSSRS is zero (not supported)");
5833         } else if (data != 0) {
5834             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5835                            "attempted to set a read only bit"
5836                            " of controller status");
5837         }
5838         break;
5839     case NVME_REG_NSSR:
5840         if (data == 0x4e564d65) {
5841             trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5842         } else {
5843             /* The spec says that writes of other values have no effect */
5844             return;
5845         }
5846         break;
5847     case NVME_REG_AQA:
5848         stl_le_p(&n->bar.aqa, data);
5849         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5850         break;
5851     case NVME_REG_ASQ:
5852         stn_le_p(&n->bar.asq, size, data);
5853         trace_pci_nvme_mmio_asqaddr(data);
5854         break;
5855     case NVME_REG_ASQ + 4:
5856         stl_le_p((uint8_t *)&n->bar.asq + 4, data);
5857         trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
5858         break;
5859     case NVME_REG_ACQ:
5860         trace_pci_nvme_mmio_acqaddr(data);
5861         stn_le_p(&n->bar.acq, size, data);
5862         break;
5863     case NVME_REG_ACQ + 4:
5864         stl_le_p((uint8_t *)&n->bar.acq + 4, data);
5865         trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
5866         break;
5867     case NVME_REG_CMBLOC:
5868         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5869                        "invalid write to reserved CMBLOC"
5870                        " when CMBSZ is zero, ignored");
5871         return;
5872     case NVME_REG_CMBSZ:
5873         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5874                        "invalid write to read only CMBSZ, ignored");
5875         return;
5876     case NVME_REG_CMBMSC:
5877         if (!NVME_CAP_CMBS(cap)) {
5878             return;
5879         }
5880 
5881         stn_le_p(&n->bar.cmbmsc, size, data);
5882         n->cmb.cmse = false;
5883 
5884         if (NVME_CMBMSC_CRE(data)) {
5885             nvme_cmb_enable_regs(n);
5886 
5887             if (NVME_CMBMSC_CMSE(data)) {
5888                 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
5889                 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
5890                 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5891                     uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
5892                     NVME_CMBSTS_SET_CBAI(cmbsts, 1);
5893                     stl_le_p(&n->bar.cmbsts, cmbsts);
5894                     return;
5895                 }
5896 
5897                 n->cmb.cba = cba;
5898                 n->cmb.cmse = true;
5899             }
5900         } else {
5901             n->bar.cmbsz = 0;
5902             n->bar.cmbloc = 0;
5903         }
5904 
5905         return;
5906     case NVME_REG_CMBMSC + 4:
5907         stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
5908         return;
5909 
5910     case NVME_REG_PMRCAP:
5911         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5912                        "invalid write to PMRCAP register, ignored");
5913         return;
5914     case NVME_REG_PMRCTL:
5915         if (!NVME_CAP_PMRS(cap)) {
5916             return;
5917         }
5918 
5919         stl_le_p(&n->bar.pmrctl, data);
5920         if (NVME_PMRCTL_EN(data)) {
5921             memory_region_set_enabled(&n->pmr.dev->mr, true);
5922             pmrsts = 0;
5923         } else {
5924             memory_region_set_enabled(&n->pmr.dev->mr, false);
5925             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
5926             n->pmr.cmse = false;
5927         }
5928         stl_le_p(&n->bar.pmrsts, pmrsts);
5929         return;
5930     case NVME_REG_PMRSTS:
5931         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5932                        "invalid write to PMRSTS register, ignored");
5933         return;
5934     case NVME_REG_PMREBS:
5935         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5936                        "invalid write to PMREBS register, ignored");
5937         return;
5938     case NVME_REG_PMRSWTP:
5939         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5940                        "invalid write to PMRSWTP register, ignored");
5941         return;
5942     case NVME_REG_PMRMSCL:
5943         if (!NVME_CAP_PMRS(cap)) {
5944             return;
5945         }
5946 
5947         stl_le_p(&n->bar.pmrmscl, data);
5948         n->pmr.cmse = false;
5949 
5950         if (NVME_PMRMSCL_CMSE(data)) {
5951             uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
5952             hwaddr cba = pmrmscu << 32 |
5953                 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
5954             if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5955                 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
5956                 stl_le_p(&n->bar.pmrsts, pmrsts);
5957                 return;
5958             }
5959 
5960             n->pmr.cmse = true;
5961             n->pmr.cba = cba;
5962         }
5963 
5964         return;
5965     case NVME_REG_PMRMSCU:
5966         if (!NVME_CAP_PMRS(cap)) {
5967             return;
5968         }
5969 
5970         stl_le_p(&n->bar.pmrmscu, data);
5971         return;
5972     default:
5973         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5974                        "invalid MMIO write,"
5975                        " offset=0x%"PRIx64", data=%"PRIx64"",
5976                        offset, data);
5977         break;
5978     }
5979 }
5980 
5981 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5982 {
5983     NvmeCtrl *n = (NvmeCtrl *)opaque;
5984     uint8_t *ptr = (uint8_t *)&n->bar;
5985 
5986     trace_pci_nvme_mmio_read(addr, size);
5987 
5988     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5989         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5990                        "MMIO read not 32-bit aligned,"
5991                        " offset=0x%"PRIx64"", addr);
5992         /* should RAZ, fall through for now */
5993     } else if (unlikely(size < sizeof(uint32_t))) {
5994         NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
5995                        "MMIO read smaller than 32-bits,"
5996                        " offset=0x%"PRIx64"", addr);
5997         /* should RAZ, fall through for now */
5998     }
5999 
6000     if (addr > sizeof(n->bar) - size) {
6001         NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6002                        "MMIO read beyond last register,"
6003                        " offset=0x%"PRIx64", returning 0", addr);
6004 
6005         return 0;
6006     }
6007 
6008     /*
6009      * When PMRWBM bit 1 is set then read from
6010      * from PMRSTS should ensure prior writes
6011      * made it to persistent media
6012      */
6013     if (addr == NVME_REG_PMRSTS &&
6014         (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6015         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6016     }
6017 
6018     return ldn_le_p(ptr + addr, size);
6019 }
6020 
6021 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6022 {
6023     uint32_t qid;
6024 
6025     if (unlikely(addr & ((1 << 2) - 1))) {
6026         NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6027                        "doorbell write not 32-bit aligned,"
6028                        " offset=0x%"PRIx64", ignoring", addr);
6029         return;
6030     }
6031 
6032     if (((addr - 0x1000) >> 2) & 1) {
6033         /* Completion queue doorbell write */
6034 
6035         uint16_t new_head = val & 0xffff;
6036         int start_sqs;
6037         NvmeCQueue *cq;
6038 
6039         qid = (addr - (0x1000 + (1 << 2))) >> 3;
6040         if (unlikely(nvme_check_cqid(n, qid))) {
6041             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6042                            "completion queue doorbell write"
6043                            " for nonexistent queue,"
6044                            " sqid=%"PRIu32", ignoring", qid);
6045 
6046             /*
6047              * NVM Express v1.3d, Section 4.1 state: "If host software writes
6048              * an invalid value to the Submission Queue Tail Doorbell or
6049              * Completion Queue Head Doorbell regiter and an Asynchronous Event
6050              * Request command is outstanding, then an asynchronous event is
6051              * posted to the Admin Completion Queue with a status code of
6052              * Invalid Doorbell Write Value."
6053              *
6054              * Also note that the spec includes the "Invalid Doorbell Register"
6055              * status code, but nowhere does it specify when to use it.
6056              * However, it seems reasonable to use it here in a similar
6057              * fashion.
6058              */
6059             if (n->outstanding_aers) {
6060                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6061                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6062                                    NVME_LOG_ERROR_INFO);
6063             }
6064 
6065             return;
6066         }
6067 
6068         cq = n->cq[qid];
6069         if (unlikely(new_head >= cq->size)) {
6070             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6071                            "completion queue doorbell write value"
6072                            " beyond queue size, sqid=%"PRIu32","
6073                            " new_head=%"PRIu16", ignoring",
6074                            qid, new_head);
6075 
6076             if (n->outstanding_aers) {
6077                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6078                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6079                                    NVME_LOG_ERROR_INFO);
6080             }
6081 
6082             return;
6083         }
6084 
6085         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6086 
6087         start_sqs = nvme_cq_full(cq) ? 1 : 0;
6088         cq->head = new_head;
6089         if (start_sqs) {
6090             NvmeSQueue *sq;
6091             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6092                 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6093             }
6094             timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6095         }
6096 
6097         if (cq->tail == cq->head) {
6098             if (cq->irq_enabled) {
6099                 n->cq_pending--;
6100             }
6101 
6102             nvme_irq_deassert(n, cq);
6103         }
6104     } else {
6105         /* Submission queue doorbell write */
6106 
6107         uint16_t new_tail = val & 0xffff;
6108         NvmeSQueue *sq;
6109 
6110         qid = (addr - 0x1000) >> 3;
6111         if (unlikely(nvme_check_sqid(n, qid))) {
6112             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6113                            "submission queue doorbell write"
6114                            " for nonexistent queue,"
6115                            " sqid=%"PRIu32", ignoring", qid);
6116 
6117             if (n->outstanding_aers) {
6118                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6119                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6120                                    NVME_LOG_ERROR_INFO);
6121             }
6122 
6123             return;
6124         }
6125 
6126         sq = n->sq[qid];
6127         if (unlikely(new_tail >= sq->size)) {
6128             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6129                            "submission queue doorbell write value"
6130                            " beyond queue size, sqid=%"PRIu32","
6131                            " new_tail=%"PRIu16", ignoring",
6132                            qid, new_tail);
6133 
6134             if (n->outstanding_aers) {
6135                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6136                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6137                                    NVME_LOG_ERROR_INFO);
6138             }
6139 
6140             return;
6141         }
6142 
6143         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6144 
6145         sq->tail = new_tail;
6146         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6147     }
6148 }
6149 
6150 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6151                             unsigned size)
6152 {
6153     NvmeCtrl *n = (NvmeCtrl *)opaque;
6154 
6155     trace_pci_nvme_mmio_write(addr, data, size);
6156 
6157     if (addr < sizeof(n->bar)) {
6158         nvme_write_bar(n, addr, data, size);
6159     } else {
6160         nvme_process_db(n, addr, data);
6161     }
6162 }
6163 
/*
 * Access callbacks for the controller register region (register block and
 * doorbells). The region is little endian; the callbacks are declared to
 * implement accesses of 2 to 8 bytes.
 */
static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};
6173 
6174 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6175                            unsigned size)
6176 {
6177     NvmeCtrl *n = (NvmeCtrl *)opaque;
6178     stn_le_p(&n->cmb.buf[addr], size, data);
6179 }
6180 
6181 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6182 {
6183     NvmeCtrl *n = (NvmeCtrl *)opaque;
6184     return ldn_le_p(&n->cmb.buf[addr], size);
6185 }
6186 
/*
 * Access callbacks for the Controller Memory Buffer region, which is backed
 * by n->cmb.buf. The region is little endian; the callbacks are declared to
 * implement accesses of 1 to 8 bytes.
 */
static const MemoryRegionOps nvme_cmb_ops = {
    .read = nvme_cmb_read,
    .write = nvme_cmb_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
6196 
6197 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6198 {
6199     NvmeParams *params = &n->params;
6200 
6201     if (params->num_queues) {
6202         warn_report("num_queues is deprecated; please use max_ioqpairs "
6203                     "instead");
6204 
6205         params->max_ioqpairs = params->num_queues - 1;
6206     }
6207 
6208     if (n->namespace.blkconf.blk && n->subsys) {
6209         error_setg(errp, "subsystem support is unavailable with legacy "
6210                    "namespace ('drive' property)");
6211         return;
6212     }
6213 
6214     if (params->max_ioqpairs < 1 ||
6215         params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6216         error_setg(errp, "max_ioqpairs must be between 1 and %d",
6217                    NVME_MAX_IOQPAIRS);
6218         return;
6219     }
6220 
6221     if (params->msix_qsize < 1 ||
6222         params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6223         error_setg(errp, "msix_qsize must be between 1 and %d",
6224                    PCI_MSIX_FLAGS_QSIZE + 1);
6225         return;
6226     }
6227 
6228     if (!params->serial) {
6229         error_setg(errp, "serial property not set");
6230         return;
6231     }
6232 
6233     if (n->pmr.dev) {
6234         if (host_memory_backend_is_mapped(n->pmr.dev)) {
6235             error_setg(errp, "can't use already busy memdev: %s",
6236                        object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6237             return;
6238         }
6239 
6240         if (!is_power_of_2(n->pmr.dev->size)) {
6241             error_setg(errp, "pmr backend size needs to be power of 2 in size");
6242             return;
6243         }
6244 
6245         host_memory_backend_set_mapped(n->pmr.dev, true);
6246     }
6247 
6248     if (n->params.zasl > n->params.mdts) {
6249         error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6250                    "than or equal to mdts (Maximum Data Transfer Size)");
6251         return;
6252     }
6253 
6254     if (!n->params.vsl) {
6255         error_setg(errp, "vsl must be non-zero");
6256         return;
6257     }
6258 }
6259 
6260 static void nvme_init_state(NvmeCtrl *n)
6261 {
6262     /* add one to max_ioqpairs to account for the admin queue pair */
6263     n->reg_size = pow2ceil(sizeof(NvmeBar) +
6264                            2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6265     n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6266     n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6267     n->temperature = NVME_TEMPERATURE;
6268     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6269     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6270     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6271 }
6272 
6273 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6274 {
6275     uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6276     uint64_t cap = ldq_le_p(&n->bar.cap);
6277 
6278     n->cmb.buf = g_malloc0(cmb_size);
6279     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6280                           "nvme-cmb", cmb_size);
6281     pci_register_bar(pci_dev, NVME_CMB_BIR,
6282                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6283                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6284                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6285 
6286     NVME_CAP_SET_CMBS(cap, 1);
6287     stq_le_p(&n->bar.cap, cap);
6288 
6289     if (n->params.legacy_cmb) {
6290         nvme_cmb_enable_regs(n);
6291         n->cmb.cmse = true;
6292     }
6293 }
6294 
6295 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6296 {
6297     uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6298 
6299     NVME_PMRCAP_SET_RDS(pmrcap, 1);
6300     NVME_PMRCAP_SET_WDS(pmrcap, 1);
6301     NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6302     /* Turn on bit 1 support */
6303     NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6304     NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6305     stl_le_p(&n->bar.pmrcap, pmrcap);
6306 
6307     pci_register_bar(pci_dev, NVME_PMR_BIR,
6308                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6309                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6310                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6311 
6312     memory_region_set_enabled(&n->pmr.dev->mr, false);
6313 }
6314 
6315 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6316 {
6317     uint8_t *pci_conf = pci_dev->config;
6318     uint64_t bar_size, msix_table_size, msix_pba_size;
6319     unsigned msix_table_offset, msix_pba_offset;
6320     int ret;
6321 
6322     Error *err = NULL;
6323 
6324     pci_conf[PCI_INTERRUPT_PIN] = 1;
6325     pci_config_set_prog_interface(pci_conf, 0x2);
6326 
6327     if (n->params.use_intel_id) {
6328         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6329         pci_config_set_device_id(pci_conf, 0x5845);
6330     } else {
6331         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6332         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6333     }
6334 
6335     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6336     pcie_endpoint_cap_init(pci_dev, 0x80);
6337 
6338     bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6339     msix_table_offset = bar_size;
6340     msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6341 
6342     bar_size += msix_table_size;
6343     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6344     msix_pba_offset = bar_size;
6345     msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6346 
6347     bar_size += msix_pba_size;
6348     bar_size = pow2ceil(bar_size);
6349 
6350     memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6351     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6352                           n->reg_size);
6353     memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6354 
6355     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6356                      PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6357     ret = msix_init(pci_dev, n->params.msix_qsize,
6358                     &n->bar0, 0, msix_table_offset,
6359                     &n->bar0, 0, msix_pba_offset, 0, &err);
6360     if (ret < 0) {
6361         if (ret == -ENOTSUP) {
6362             warn_report_err(err);
6363         } else {
6364             error_propagate(errp, err);
6365             return ret;
6366         }
6367     }
6368 
6369     if (n->params.cmb_size_mb) {
6370         nvme_init_cmb(n, pci_dev);
6371     }
6372 
6373     if (n->pmr.dev) {
6374         nvme_init_pmr(n, pci_dev);
6375     }
6376 
6377     return 0;
6378 }
6379 
6380 static void nvme_init_subnqn(NvmeCtrl *n)
6381 {
6382     NvmeSubsystem *subsys = n->subsys;
6383     NvmeIdCtrl *id = &n->id_ctrl;
6384 
6385     if (!subsys) {
6386         snprintf((char *)id->subnqn, sizeof(id->subnqn),
6387                  "nqn.2019-08.org.qemu:%s", n->params.serial);
6388     } else {
6389         pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6390     }
6391 }
6392 
/*
 * Initialize the Identify Controller data structure (n->id_ctrl) and the
 * CAP, VS, INTMS and INTMC registers in the register shadow (n->bar).
 *
 * @n:       controller to initialize
 * @pci_dev: PCI device whose config space provides the vendor ids
 */
static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
{
    NvmeIdCtrl *id = &n->id_ctrl;
    uint8_t *pci_conf = pci_dev->config;
    uint64_t cap = ldq_le_p(&n->bar.cap);

    /* vendor ids mirror the PCI config space */
    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    /* fixed-width string fields are padded with spaces */
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');

    id->cntlid = cpu_to_le16(n->cntlid);

    id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);

    id->rab = 6;

    if (n->params.use_intel_id) {
        id->ieee[0] = 0xb3;
        id->ieee[1] = 0x02;
        id->ieee[2] = 0x00;
    } else {
        id->ieee[0] = 0x00;
        id->ieee[1] = 0x54;
        id->ieee[2] = 0x52;
    }

    id->mdts = n->params.mdts;
    id->ver = cpu_to_le32(NVME_SPEC_VER);
    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
    id->cntrltype = 0x1;

    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Note that there can easily be
     * many Abort commands in the queues, but they are not considered
     * "executing" until processed by nvme_abort.
     *
     * The specification recommends a value of 3 for Abort Command Limit (four
     * concurrently outstanding Abort commands), so lets use that though it is
     * inconsequential.
     */
    id->acl = 3;
    id->aerl = n->params.aerl;
    id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;

    /* recommended default value (~70 C) */
    id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
    id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);

    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);

    /*
     * NOTE: If this device ever supports a command set that does NOT use 0x0
     * as a Flush-equivalent operation, support for the broadcast NSID in Flush
     * should probably be removed.
     *
     * See comment in nvme_io_cmd.
     */
    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;

    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                           NVME_CTRL_SGLS_BITBUCKET);

    nvme_init_subnqn(n);

    /* only power state descriptor 0 is filled in */
    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    if (n->subsys) {
        id->cmic |= NVME_CMIC_MULTI_CTRL;
    }

    /* update the CAP value loaded above and write it back in one go */
    NVME_CAP_SET_MQES(cap, 0x7ff);
    NVME_CAP_SET_CQR(cap, 1);
    NVME_CAP_SET_TO(cap, 0xf);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
    NVME_CAP_SET_MPSMAX(cap, 4);
    NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
    NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
    stq_le_p(&n->bar.cap, cap);

    stl_le_p(&n->bar.vs, NVME_SPEC_VER);
    /* no interrupts masked initially */
    n->bar.intmc = n->bar.intms = 0;
}
6490 
6491 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6492 {
6493     int cntlid;
6494 
6495     if (!n->subsys) {
6496         return 0;
6497     }
6498 
6499     cntlid = nvme_subsys_register_ctrl(n, errp);
6500     if (cntlid < 0) {
6501         return -1;
6502     }
6503 
6504     n->cntlid = cntlid;
6505 
6506     return 0;
6507 }
6508 
6509 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6510 {
6511     uint32_t nsid = ns->params.nsid;
6512     assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6513 
6514     n->namespaces[nsid] = ns;
6515     ns->attached++;
6516 
6517     n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6518                             BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6519 }
6520 
6521 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6522 {
6523     NvmeCtrl *n = NVME(pci_dev);
6524     NvmeNamespace *ns;
6525     Error *local_err = NULL;
6526 
6527     nvme_check_constraints(n, &local_err);
6528     if (local_err) {
6529         error_propagate(errp, local_err);
6530         return;
6531     }
6532 
6533     qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6534                         &pci_dev->qdev, n->parent_obj.qdev.id);
6535 
6536     nvme_init_state(n);
6537     if (nvme_init_pci(n, pci_dev, errp)) {
6538         return;
6539     }
6540 
6541     if (nvme_init_subsys(n, errp)) {
6542         error_propagate(errp, local_err);
6543         return;
6544     }
6545     nvme_init_ctrl(n, pci_dev);
6546 
6547     /* setup a namespace if the controller drive property was given */
6548     if (n->namespace.blkconf.blk) {
6549         ns = &n->namespace;
6550         ns->params.nsid = 1;
6551 
6552         if (nvme_ns_setup(ns, errp)) {
6553             return;
6554         }
6555 
6556         nvme_attach_ns(n, ns);
6557     }
6558 }
6559 
6560 static void nvme_exit(PCIDevice *pci_dev)
6561 {
6562     NvmeCtrl *n = NVME(pci_dev);
6563     NvmeNamespace *ns;
6564     int i;
6565 
6566     nvme_ctrl_reset(n);
6567 
6568     if (n->subsys) {
6569         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6570             ns = nvme_ns(n, i);
6571             if (ns) {
6572                 ns->attached--;
6573             }
6574         }
6575 
6576         nvme_subsys_unregister_ctrl(n->subsys, n);
6577     }
6578 
6579     g_free(n->cq);
6580     g_free(n->sq);
6581     g_free(n->aer_reqs);
6582 
6583     if (n->params.cmb_size_mb) {
6584         g_free(n->cmb.buf);
6585     }
6586 
6587     if (n->pmr.dev) {
6588         host_memory_backend_set_mapped(n->pmr.dev, false);
6589     }
6590     msix_uninit(pci_dev, &n->bar0, &n->bar0);
6591     memory_region_del_subregion(&n->bar0, &n->iomem);
6592 }
6593 
/*
 * qdev properties of the nvme device. Each entry maps a property name to a
 * field of NvmeCtrl; the trailing argument of the scalar entries is the
 * default value.
 */
static Property nvme_props[] = {
    /* legacy single-namespace configuration ('drive' and friends) */
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
                     NvmeSubsystem *),
    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
    /* deprecated in favour of max_ioqpairs, see nvme_check_constraints() */
    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                     params.auto_transition_zones, true),
    DEFINE_PROP_END_OF_LIST(),
};
6616 
6617 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6618                                    void *opaque, Error **errp)
6619 {
6620     NvmeCtrl *n = NVME(obj);
6621     uint8_t value = n->smart_critical_warning;
6622 
6623     visit_type_uint8(v, name, &value, errp);
6624 }
6625 
6626 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6627                                    void *opaque, Error **errp)
6628 {
6629     NvmeCtrl *n = NVME(obj);
6630     uint8_t value, old_value, cap = 0, index, event;
6631 
6632     if (!visit_type_uint8(v, name, &value, errp)) {
6633         return;
6634     }
6635 
6636     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6637           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6638     if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6639         cap |= NVME_SMART_PMR_UNRELIABLE;
6640     }
6641 
6642     if ((value & cap) != value) {
6643         error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6644                    value & ~cap);
6645         return;
6646     }
6647 
6648     old_value = n->smart_critical_warning;
6649     n->smart_critical_warning = value;
6650 
6651     /* only inject new bits of smart critical warning */
6652     for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6653         event = 1 << index;
6654         if (value & ~old_value & event)
6655             nvme_smart_event(n, event);
6656     }
6657 }
6658 
/* Migration description: the device is flagged as unmigratable. */
static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};
6663 
6664 static void nvme_class_init(ObjectClass *oc, void *data)
6665 {
6666     DeviceClass *dc = DEVICE_CLASS(oc);
6667     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6668 
6669     pc->realize = nvme_realize;
6670     pc->exit = nvme_exit;
6671     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6672     pc->revision = 2;
6673 
6674     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6675     dc->desc = "Non-Volatile Memory Express";
6676     device_class_set_props(dc, nvme_props);
6677     dc->vmsd = &nvme_vmstate;
6678 }
6679 
6680 static void nvme_instance_init(Object *obj)
6681 {
6682     NvmeCtrl *n = NVME(obj);
6683 
6684     device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6685                                   "bootindex", "/namespace@1,0",
6686                                   DEVICE(obj));
6687 
6688     object_property_add(obj, "smart_critical_warning", "uint8",
6689                         nvme_get_smart_warning,
6690                         nvme_set_smart_warning, NULL, NULL);
6691 }
6692 
/*
 * QOM type description of the NVMe controller: a PCI device implementing the
 * PCIe device interface, with NvmeCtrl as its instance structure.
 */
static const TypeInfo nvme_info = {
    .name          = TYPE_NVME,
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .instance_init = nvme_instance_init,
    .class_init    = nvme_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_PCIE_DEVICE },
        { }
    },
};
6704 
/* QOM type description of the bus created in place by nvme_realize(). */
static const TypeInfo nvme_bus_info = {
    .name = TYPE_NVME_BUS,
    .parent = TYPE_BUS,
    .instance_size = sizeof(NvmeBus),
};
6710 
/* Register the controller and bus types defined in this file. */
static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
    type_register_static(&nvme_bus_info);
}

/* hook the registration function into type initialization */
type_init(nvme_register_types)
6718