xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 91654e61)
1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <keith.busch@intel.com>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10 
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13  *
14  *  https://nvmexpress.org/developers/nvme-specification/
15  *
16  *
17  * Notes on coding style
18  * ---------------------
19  * While QEMU coding style prefers lowercase hexadecimals in constants, the
20  * NVMe subsystem use thes format from the NVMe specifications in the comments
21  * (i.e. 'h' suffix instead of '0x' prefix).
22  *
23  * Usage
24  * -----
25  * See docs/system/nvme.rst for extensive documentation.
26  *
27  * Add options:
28  *      -drive file=<file>,if=none,id=<drive_id>
29  *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30  *      -device nvme,serial=<serial>,id=<bus_name>, \
31  *              cmb_size_mb=<cmb_size_mb[optional]>, \
32  *              [pmrdev=<mem_backend_file_id>,] \
33  *              max_ioqpairs=<N[optional]>, \
34  *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35  *              mdts=<N[optional]>,vsl=<N[optional]>, \
36  *              zoned.zasl=<N[optional]>, \
37  *              zoned.auto_transition=<on|off[optional]>, \
38  *              subsys=<subsys_id>
39  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
40  *              zoned=<true|false[optional]>, \
41  *              subsys=<subsys_id>,detached=<true|false[optional]>
42  *
43  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
44  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
45  * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
46  * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
47  *
48  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
49  * For example:
50  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
51  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
52  *
53  * The PMR will use BAR 4/5 exclusively.
54  *
55  * To place controller(s) and namespace(s) to a subsystem, then provide
56  * nvme-subsys device as above.
57  *
58  * nvme subsystem device parameters
59  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60  * - `nqn`
61  *   This parameter provides the `<nqn_id>` part of the string
62  *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
63  *   of subsystem controllers. Note that `<nqn_id>` should be unique per
64  *   subsystem, but this is not enforced by QEMU. If not specified, it will
65  *   default to the value of the `id` parameter (`<subsys_id>`).
66  *
67  * nvme device parameters
68  * ~~~~~~~~~~~~~~~~~~~~~~
69  * - `subsys`
70  *   Specifying this parameter attaches the controller to the subsystem and
71  *   the SUBNQN field in the controller will report the NQN of the subsystem
72  *   device. This also enables multi controller capability represented in
73  *   Identify Controller data structure in CMIC (Controller Multi-path I/O and
74  *   Namesapce Sharing Capabilities).
75  *
76  * - `aerl`
77  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
78  *   of concurrently outstanding Asynchronous Event Request commands support
79  *   by the controller. This is a 0's based value.
80  *
81  * - `aer_max_queued`
82  *   This is the maximum number of events that the device will enqueue for
83  *   completion when there are no outstanding AERs. When the maximum number of
84  *   enqueued events are reached, subsequent events will be dropped.
85  *
86  * - `mdts`
87  *   Indicates the maximum data transfer size for a command that transfers data
88  *   between host-accessible memory and the controller. The value is specified
89  *   as a power of two (2^n) and is in units of the minimum memory page size
90  *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
91  *
92  * - `vsl`
93  *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
94  *   this value is specified as a power of two (2^n) and is in units of the
95  *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
96  *   KiB).
97  *
98  * - `zoned.zasl`
99  *   Indicates the maximum data transfer size for the Zone Append command. Like
100  *   `mdts`, the value is specified as a power of two (2^n) and is in units of
101  *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
102  *   defaulting to the value of `mdts`).
103  *
104  * - `zoned.auto_transition`
105  *   Indicates if zones in zone state implicitly opened can be automatically
106  *   transitioned to zone state closed for resource management purposes.
107  *   Defaults to 'on'.
108  *
109  * nvme namespace device parameters
110  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111  * - `shared`
112  *   When the parent nvme device (as defined explicitly by the 'bus' parameter
113  *   or implicitly by the most recently defined NvmeBus) is linked to an
114  *   nvme-subsys device, the namespace will be attached to all controllers in
115  *   the subsystem. If set to 'off' (the default), the namespace will remain a
116  *   private namespace and may only be attached to a single controller at a
117  *   time.
118  *
119  * - `detached`
120  *   This parameter is only valid together with the `subsys` parameter. If left
121  *   at the default value (`false/off`), the namespace will be attached to all
122  *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
123  *   namespace will be available in the subsystem but not attached to any
124  *   controllers.
125  *
126  * Setting `zoned` to true selects Zoned Command Set at the namespace.
127  * In this case, the following namespace properties are available to configure
128  * zoned operation:
129  *     zoned.zone_size=<zone size in bytes, default: 128MiB>
130  *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
131  *
132  *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
133  *         The value 0 (default) forces zone capacity to be the same as zone
134  *         size. The value of this property may not exceed zone size.
135  *
136  *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
137  *         This value needs to be specified in 64B units. If it is zero,
138  *         namespace(s) will not support zone descriptor extensions.
139  *
140  *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
141  *         The default value means there is no limit to the number of
142  *         concurrently active zones.
143  *
144  *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
145  *         The default value means there is no limit to the number of
146  *         concurrently open zones.
147  *
148  *     zoned.cross_read=<enable RAZB, default: false>
149  *         Setting this property to true enables Read Across Zone Boundaries.
150  */
151 
152 #include "qemu/osdep.h"
153 #include "qemu/cutils.h"
154 #include "qemu/error-report.h"
155 #include "qemu/log.h"
156 #include "qemu/units.h"
157 #include "qapi/error.h"
158 #include "qapi/visitor.h"
159 #include "sysemu/sysemu.h"
160 #include "sysemu/block-backend.h"
161 #include "sysemu/hostmem.h"
162 #include "hw/pci/msix.h"
163 #include "migration/vmstate.h"
164 
165 #include "nvme.h"
166 #include "trace.h"
167 
168 #define NVME_MAX_IOQPAIRS 0xffff
169 #define NVME_DB_SIZE  4
170 #define NVME_SPEC_VER 0x00010400
171 #define NVME_CMB_BIR 2
172 #define NVME_PMR_BIR 4
173 #define NVME_TEMPERATURE 0x143
174 #define NVME_TEMPERATURE_WARNING 0x157
175 #define NVME_TEMPERATURE_CRITICAL 0x175
176 #define NVME_NUM_FW_SLOTS 1
177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
178 
179 #define NVME_GUEST_ERR(trace, fmt, ...) \
180     do { \
181         (trace_##trace)(__VA_ARGS__); \
182         qemu_log_mask(LOG_GUEST_ERROR, #trace \
183             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
184     } while (0)
185 
186 static const bool nvme_feature_support[NVME_FID_MAX] = {
187     [NVME_ARBITRATION]              = true,
188     [NVME_POWER_MANAGEMENT]         = true,
189     [NVME_TEMPERATURE_THRESHOLD]    = true,
190     [NVME_ERROR_RECOVERY]           = true,
191     [NVME_VOLATILE_WRITE_CACHE]     = true,
192     [NVME_NUMBER_OF_QUEUES]         = true,
193     [NVME_INTERRUPT_COALESCING]     = true,
194     [NVME_INTERRUPT_VECTOR_CONF]    = true,
195     [NVME_WRITE_ATOMICITY]          = true,
196     [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
197     [NVME_TIMESTAMP]                = true,
198     [NVME_COMMAND_SET_PROFILE]      = true,
199 };
200 
201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
202     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
203     [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
204     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
205     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
206     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
207     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
208     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
209 };
210 
211 static const uint32_t nvme_cse_acs[256] = {
212     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
213     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
214     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
215     [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
216     [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
217     [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
218     [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
219     [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
220     [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
221     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
222     [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
223     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
224 };
225 
226 static const uint32_t nvme_cse_iocs_none[256];
227 
228 static const uint32_t nvme_cse_iocs_nvm[256] = {
229     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
231     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
233     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
235     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
236     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
237 };
238 
239 static const uint32_t nvme_cse_iocs_zoned[256] = {
240     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
241     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
242     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
243     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
244     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
245     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
246     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
247     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
248     [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
249     [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
250     [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
251 };
252 
253 static void nvme_process_sq(void *opaque);
254 
255 static uint16_t nvme_sqid(NvmeRequest *req)
256 {
257     return le16_to_cpu(req->sq->sqid);
258 }
259 
260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261                                    NvmeZoneState state)
262 {
263     if (QTAILQ_IN_USE(zone, entry)) {
264         switch (nvme_get_zone_state(zone)) {
265         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266             QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267             break;
268         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270             break;
271         case NVME_ZONE_STATE_CLOSED:
272             QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273             break;
274         case NVME_ZONE_STATE_FULL:
275             QTAILQ_REMOVE(&ns->full_zones, zone, entry);
276         default:
277             ;
278         }
279     }
280 
281     nvme_set_zone_state(zone, state);
282 
283     switch (state) {
284     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285         QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286         break;
287     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288         QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289         break;
290     case NVME_ZONE_STATE_CLOSED:
291         QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292         break;
293     case NVME_ZONE_STATE_FULL:
294         QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
295     case NVME_ZONE_STATE_READ_ONLY:
296         break;
297     default:
298         zone->d.za = 0;
299     }
300 }
301 
302 /*
303  * Check if we can open a zone without exceeding open/active limits.
304  * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
305  */
306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
307 {
308     if (ns->params.max_active_zones != 0 &&
309         ns->nr_active_zones + act > ns->params.max_active_zones) {
310         trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
311         return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
312     }
313     if (ns->params.max_open_zones != 0 &&
314         ns->nr_open_zones + opn > ns->params.max_open_zones) {
315         trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
316         return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
317     }
318 
319     return NVME_SUCCESS;
320 }
321 
322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
323 {
324     hwaddr hi, lo;
325 
326     if (!n->cmb.cmse) {
327         return false;
328     }
329 
330     lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
331     hi = lo + int128_get64(n->cmb.mem.size);
332 
333     return addr >= lo && addr < hi;
334 }
335 
336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
337 {
338     hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
339     return &n->cmb.buf[addr - base];
340 }
341 
342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
343 {
344     hwaddr hi;
345 
346     if (!n->pmr.cmse) {
347         return false;
348     }
349 
350     hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
351 
352     return addr >= n->pmr.cba && addr < hi;
353 }
354 
355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
356 {
357     return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
358 }
359 
360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
361 {
362     hwaddr hi = addr + size - 1;
363     if (hi < addr) {
364         return 1;
365     }
366 
367     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
368         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
369         return 0;
370     }
371 
372     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
373         memcpy(buf, nvme_addr_to_pmr(n, addr), size);
374         return 0;
375     }
376 
377     return pci_dma_read(&n->parent_obj, addr, buf, size);
378 }
379 
380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
381 {
382     hwaddr hi = addr + size - 1;
383     if (hi < addr) {
384         return 1;
385     }
386 
387     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
388         memcpy(nvme_addr_to_cmb(n, addr), buf, size);
389         return 0;
390     }
391 
392     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
393         memcpy(nvme_addr_to_pmr(n, addr), buf, size);
394         return 0;
395     }
396 
397     return pci_dma_write(&n->parent_obj, addr, buf, size);
398 }
399 
400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
401 {
402     return nsid &&
403         (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
404 }
405 
406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
407 {
408     return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
409 }
410 
411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
412 {
413     return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
414 }
415 
416 static void nvme_inc_cq_tail(NvmeCQueue *cq)
417 {
418     cq->tail++;
419     if (cq->tail >= cq->size) {
420         cq->tail = 0;
421         cq->phase = !cq->phase;
422     }
423 }
424 
425 static void nvme_inc_sq_head(NvmeSQueue *sq)
426 {
427     sq->head = (sq->head + 1) % sq->size;
428 }
429 
430 static uint8_t nvme_cq_full(NvmeCQueue *cq)
431 {
432     return (cq->tail + 1) % cq->size == cq->head;
433 }
434 
435 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
436 {
437     return sq->head == sq->tail;
438 }
439 
440 static void nvme_irq_check(NvmeCtrl *n)
441 {
442     uint32_t intms = ldl_le_p(&n->bar.intms);
443 
444     if (msix_enabled(&(n->parent_obj))) {
445         return;
446     }
447     if (~intms & n->irq_status) {
448         pci_irq_assert(&n->parent_obj);
449     } else {
450         pci_irq_deassert(&n->parent_obj);
451     }
452 }
453 
454 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
455 {
456     if (cq->irq_enabled) {
457         if (msix_enabled(&(n->parent_obj))) {
458             trace_pci_nvme_irq_msix(cq->vector);
459             msix_notify(&(n->parent_obj), cq->vector);
460         } else {
461             trace_pci_nvme_irq_pin();
462             assert(cq->vector < 32);
463             n->irq_status |= 1 << cq->vector;
464             nvme_irq_check(n);
465         }
466     } else {
467         trace_pci_nvme_irq_masked();
468     }
469 }
470 
471 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
472 {
473     if (cq->irq_enabled) {
474         if (msix_enabled(&(n->parent_obj))) {
475             return;
476         } else {
477             assert(cq->vector < 32);
478             if (!n->cq_pending) {
479                 n->irq_status &= ~(1 << cq->vector);
480             }
481             nvme_irq_check(n);
482         }
483     }
484 }
485 
486 static void nvme_req_clear(NvmeRequest *req)
487 {
488     req->ns = NULL;
489     req->opaque = NULL;
490     req->aiocb = NULL;
491     memset(&req->cqe, 0x0, sizeof(req->cqe));
492     req->status = NVME_SUCCESS;
493 }
494 
495 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
496 {
497     if (dma) {
498         pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
499         sg->flags = NVME_SG_DMA;
500     } else {
501         qemu_iovec_init(&sg->iov, 0);
502     }
503 
504     sg->flags |= NVME_SG_ALLOC;
505 }
506 
507 static inline void nvme_sg_unmap(NvmeSg *sg)
508 {
509     if (!(sg->flags & NVME_SG_ALLOC)) {
510         return;
511     }
512 
513     if (sg->flags & NVME_SG_DMA) {
514         qemu_sglist_destroy(&sg->qsg);
515     } else {
516         qemu_iovec_destroy(&sg->iov);
517     }
518 
519     memset(sg, 0x0, sizeof(*sg));
520 }
521 
522 /*
523  * When metadata is transfered as extended LBAs, the DPTR mapped into `sg`
524  * holds both data and metadata. This function splits the data and metadata
525  * into two separate QSG/IOVs.
526  */
527 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
528                           NvmeSg *mdata)
529 {
530     NvmeSg *dst = data;
531     uint32_t trans_len, count = ns->lbasz;
532     uint64_t offset = 0;
533     bool dma = sg->flags & NVME_SG_DMA;
534     size_t sge_len;
535     size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
536     int sg_idx = 0;
537 
538     assert(sg->flags & NVME_SG_ALLOC);
539 
540     while (sg_len) {
541         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
542 
543         trans_len = MIN(sg_len, count);
544         trans_len = MIN(trans_len, sge_len - offset);
545 
546         if (dst) {
547             if (dma) {
548                 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
549                                 trans_len);
550             } else {
551                 qemu_iovec_add(&dst->iov,
552                                sg->iov.iov[sg_idx].iov_base + offset,
553                                trans_len);
554             }
555         }
556 
557         sg_len -= trans_len;
558         count -= trans_len;
559         offset += trans_len;
560 
561         if (count == 0) {
562             dst = (dst == data) ? mdata : data;
563             count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
564         }
565 
566         if (sge_len == offset) {
567             offset = 0;
568             sg_idx++;
569         }
570     }
571 }
572 
573 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
574                                   size_t len)
575 {
576     if (!len) {
577         return NVME_SUCCESS;
578     }
579 
580     trace_pci_nvme_map_addr_cmb(addr, len);
581 
582     if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
583         return NVME_DATA_TRAS_ERROR;
584     }
585 
586     qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
587 
588     return NVME_SUCCESS;
589 }
590 
591 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
592                                   size_t len)
593 {
594     if (!len) {
595         return NVME_SUCCESS;
596     }
597 
598     if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
599         return NVME_DATA_TRAS_ERROR;
600     }
601 
602     qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
603 
604     return NVME_SUCCESS;
605 }
606 
607 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
608 {
609     bool cmb = false, pmr = false;
610 
611     if (!len) {
612         return NVME_SUCCESS;
613     }
614 
615     trace_pci_nvme_map_addr(addr, len);
616 
617     if (nvme_addr_is_cmb(n, addr)) {
618         cmb = true;
619     } else if (nvme_addr_is_pmr(n, addr)) {
620         pmr = true;
621     }
622 
623     if (cmb || pmr) {
624         if (sg->flags & NVME_SG_DMA) {
625             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
626         }
627 
628         if (sg->iov.niov + 1 > IOV_MAX) {
629             goto max_mappings_exceeded;
630         }
631 
632         if (cmb) {
633             return nvme_map_addr_cmb(n, &sg->iov, addr, len);
634         } else {
635             return nvme_map_addr_pmr(n, &sg->iov, addr, len);
636         }
637     }
638 
639     if (!(sg->flags & NVME_SG_DMA)) {
640         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
641     }
642 
643     if (sg->qsg.nsg + 1 > IOV_MAX) {
644         goto max_mappings_exceeded;
645     }
646 
647     qemu_sglist_add(&sg->qsg, addr, len);
648 
649     return NVME_SUCCESS;
650 
651 max_mappings_exceeded:
652     NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
653                    "number of mappings exceed 1024");
654     return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
655 }
656 
657 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
658 {
659     return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
660 }
661 
662 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
663                              uint64_t prp2, uint32_t len)
664 {
665     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
666     trans_len = MIN(len, trans_len);
667     int num_prps = (len >> n->page_bits) + 1;
668     uint16_t status;
669     int ret;
670 
671     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
672 
673     nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
674 
675     status = nvme_map_addr(n, sg, prp1, trans_len);
676     if (status) {
677         goto unmap;
678     }
679 
680     len -= trans_len;
681     if (len) {
682         if (len > n->page_size) {
683             uint64_t prp_list[n->max_prp_ents];
684             uint32_t nents, prp_trans;
685             int i = 0;
686 
687             /*
688              * The first PRP list entry, pointed to by PRP2 may contain offset.
689              * Hence, we need to calculate the number of entries in based on
690              * that offset.
691              */
692             nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
693             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
694             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
695             if (ret) {
696                 trace_pci_nvme_err_addr_read(prp2);
697                 status = NVME_DATA_TRAS_ERROR;
698                 goto unmap;
699             }
700             while (len != 0) {
701                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
702 
703                 if (i == nents - 1 && len > n->page_size) {
704                     if (unlikely(prp_ent & (n->page_size - 1))) {
705                         trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
706                         status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
707                         goto unmap;
708                     }
709 
710                     i = 0;
711                     nents = (len + n->page_size - 1) >> n->page_bits;
712                     nents = MIN(nents, n->max_prp_ents);
713                     prp_trans = nents * sizeof(uint64_t);
714                     ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
715                                          prp_trans);
716                     if (ret) {
717                         trace_pci_nvme_err_addr_read(prp_ent);
718                         status = NVME_DATA_TRAS_ERROR;
719                         goto unmap;
720                     }
721                     prp_ent = le64_to_cpu(prp_list[i]);
722                 }
723 
724                 if (unlikely(prp_ent & (n->page_size - 1))) {
725                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
726                     status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
727                     goto unmap;
728                 }
729 
730                 trans_len = MIN(len, n->page_size);
731                 status = nvme_map_addr(n, sg, prp_ent, trans_len);
732                 if (status) {
733                     goto unmap;
734                 }
735 
736                 len -= trans_len;
737                 i++;
738             }
739         } else {
740             if (unlikely(prp2 & (n->page_size - 1))) {
741                 trace_pci_nvme_err_invalid_prp2_align(prp2);
742                 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
743                 goto unmap;
744             }
745             status = nvme_map_addr(n, sg, prp2, len);
746             if (status) {
747                 goto unmap;
748             }
749         }
750     }
751 
752     return NVME_SUCCESS;
753 
754 unmap:
755     nvme_sg_unmap(sg);
756     return status;
757 }
758 
759 /*
760  * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
761  * number of bytes mapped in len.
762  */
763 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
764                                   NvmeSglDescriptor *segment, uint64_t nsgld,
765                                   size_t *len, NvmeCmd *cmd)
766 {
767     dma_addr_t addr, trans_len;
768     uint32_t dlen;
769     uint16_t status;
770 
771     for (int i = 0; i < nsgld; i++) {
772         uint8_t type = NVME_SGL_TYPE(segment[i].type);
773 
774         switch (type) {
775         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
776             if (cmd->opcode == NVME_CMD_WRITE) {
777                 continue;
778             }
779         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
780             break;
781         case NVME_SGL_DESCR_TYPE_SEGMENT:
782         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
783             return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
784         default:
785             return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
786         }
787 
788         dlen = le32_to_cpu(segment[i].len);
789 
790         if (!dlen) {
791             continue;
792         }
793 
794         if (*len == 0) {
795             /*
796              * All data has been mapped, but the SGL contains additional
797              * segments and/or descriptors. The controller might accept
798              * ignoring the rest of the SGL.
799              */
800             uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
801             if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
802                 break;
803             }
804 
805             trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
806             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
807         }
808 
809         trans_len = MIN(*len, dlen);
810 
811         if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
812             goto next;
813         }
814 
815         addr = le64_to_cpu(segment[i].addr);
816 
817         if (UINT64_MAX - addr < dlen) {
818             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
819         }
820 
821         status = nvme_map_addr(n, sg, addr, trans_len);
822         if (status) {
823             return status;
824         }
825 
826 next:
827         *len -= trans_len;
828     }
829 
830     return NVME_SUCCESS;
831 }
832 
833 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
834                              size_t len, NvmeCmd *cmd)
835 {
836     /*
837      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
838      * dynamically allocating a potentially huge SGL. The spec allows the SGL
839      * to be larger (as in number of bytes required to describe the SGL
840      * descriptors and segment chain) than the command transfer size, so it is
841      * not bounded by MDTS.
842      */
843     const int SEG_CHUNK_SIZE = 256;
844 
845     NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
846     uint64_t nsgld;
847     uint32_t seg_len;
848     uint16_t status;
849     hwaddr addr;
850     int ret;
851 
852     sgld = &sgl;
853     addr = le64_to_cpu(sgl.addr);
854 
855     trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
856 
857     nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
858 
859     /*
860      * If the entire transfer can be described with a single data block it can
861      * be mapped directly.
862      */
863     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
864         status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
865         if (status) {
866             goto unmap;
867         }
868 
869         goto out;
870     }
871 
872     for (;;) {
873         switch (NVME_SGL_TYPE(sgld->type)) {
874         case NVME_SGL_DESCR_TYPE_SEGMENT:
875         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
876             break;
877         default:
878             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
879         }
880 
881         seg_len = le32_to_cpu(sgld->len);
882 
883         /* check the length of the (Last) Segment descriptor */
884         if ((!seg_len || seg_len & 0xf) &&
885             (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
886             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
887         }
888 
889         if (UINT64_MAX - addr < seg_len) {
890             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
891         }
892 
893         nsgld = seg_len / sizeof(NvmeSglDescriptor);
894 
895         while (nsgld > SEG_CHUNK_SIZE) {
896             if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
897                 trace_pci_nvme_err_addr_read(addr);
898                 status = NVME_DATA_TRAS_ERROR;
899                 goto unmap;
900             }
901 
902             status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
903                                        &len, cmd);
904             if (status) {
905                 goto unmap;
906             }
907 
908             nsgld -= SEG_CHUNK_SIZE;
909             addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
910         }
911 
912         ret = nvme_addr_read(n, addr, segment, nsgld *
913                              sizeof(NvmeSglDescriptor));
914         if (ret) {
915             trace_pci_nvme_err_addr_read(addr);
916             status = NVME_DATA_TRAS_ERROR;
917             goto unmap;
918         }
919 
920         last_sgld = &segment[nsgld - 1];
921 
922         /*
923          * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
924          * then we are done.
925          */
926         switch (NVME_SGL_TYPE(last_sgld->type)) {
927         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
928         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
929             status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
930             if (status) {
931                 goto unmap;
932             }
933 
934             goto out;
935 
936         default:
937             break;
938         }
939 
940         /*
941          * If the last descriptor was not a Data Block or Bit Bucket, then the
942          * current segment must not be a Last Segment.
943          */
944         if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
945             status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
946             goto unmap;
947         }
948 
949         sgld = last_sgld;
950         addr = le64_to_cpu(sgld->addr);
951 
952         /*
953          * Do not map the last descriptor; it will be a Segment or Last Segment
954          * descriptor and is handled by the next iteration.
955          */
956         status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
957         if (status) {
958             goto unmap;
959         }
960     }
961 
962 out:
963     /* if there is any residual left in len, the SGL was too short */
964     if (len) {
965         status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
966         goto unmap;
967     }
968 
969     return NVME_SUCCESS;
970 
971 unmap:
972     nvme_sg_unmap(sg);
973     return status;
974 }
975 
976 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
977                        NvmeCmd *cmd)
978 {
979     uint64_t prp1, prp2;
980 
981     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
982     case NVME_PSDT_PRP:
983         prp1 = le64_to_cpu(cmd->dptr.prp1);
984         prp2 = le64_to_cpu(cmd->dptr.prp2);
985 
986         return nvme_map_prp(n, sg, prp1, prp2, len);
987     case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
988     case NVME_PSDT_SGL_MPTR_SGL:
989         return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
990     default:
991         return NVME_INVALID_FIELD;
992     }
993 }
994 
995 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
996                               NvmeCmd *cmd)
997 {
998     int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
999     hwaddr mptr = le64_to_cpu(cmd->mptr);
1000     uint16_t status;
1001 
1002     if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1003         NvmeSglDescriptor sgl;
1004 
1005         if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1006             return NVME_DATA_TRAS_ERROR;
1007         }
1008 
1009         status = nvme_map_sgl(n, sg, sgl, len, cmd);
1010         if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1011             status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1012         }
1013 
1014         return status;
1015     }
1016 
1017     nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1018     status = nvme_map_addr(n, sg, mptr, len);
1019     if (status) {
1020         nvme_sg_unmap(sg);
1021     }
1022 
1023     return status;
1024 }
1025 
1026 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1027 {
1028     NvmeNamespace *ns = req->ns;
1029     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1030     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1031     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1032     size_t len = nvme_l2b(ns, nlb);
1033     uint16_t status;
1034 
1035     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1036         NvmeSg sg;
1037 
1038         len += nvme_m2b(ns, nlb);
1039 
1040         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1041         if (status) {
1042             return status;
1043         }
1044 
1045         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1046         nvme_sg_split(&sg, ns, &req->sg, NULL);
1047         nvme_sg_unmap(&sg);
1048 
1049         return NVME_SUCCESS;
1050     }
1051 
1052     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1053 }
1054 
1055 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1056 {
1057     NvmeNamespace *ns = req->ns;
1058     size_t len = nvme_m2b(ns, nlb);
1059     uint16_t status;
1060 
1061     if (nvme_ns_ext(ns)) {
1062         NvmeSg sg;
1063 
1064         len += nvme_l2b(ns, nlb);
1065 
1066         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1067         if (status) {
1068             return status;
1069         }
1070 
1071         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1072         nvme_sg_split(&sg, ns, NULL, &req->sg);
1073         nvme_sg_unmap(&sg);
1074 
1075         return NVME_SUCCESS;
1076     }
1077 
1078     return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1079 }
1080 
1081 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1082                                     uint32_t len, uint32_t bytes,
1083                                     int32_t skip_bytes, int64_t offset,
1084                                     NvmeTxDirection dir)
1085 {
1086     hwaddr addr;
1087     uint32_t trans_len, count = bytes;
1088     bool dma = sg->flags & NVME_SG_DMA;
1089     int64_t sge_len;
1090     int sg_idx = 0;
1091     int ret;
1092 
1093     assert(sg->flags & NVME_SG_ALLOC);
1094 
1095     while (len) {
1096         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1097 
1098         if (sge_len - offset < 0) {
1099             offset -= sge_len;
1100             sg_idx++;
1101             continue;
1102         }
1103 
1104         if (sge_len == offset) {
1105             offset = 0;
1106             sg_idx++;
1107             continue;
1108         }
1109 
1110         trans_len = MIN(len, count);
1111         trans_len = MIN(trans_len, sge_len - offset);
1112 
1113         if (dma) {
1114             addr = sg->qsg.sg[sg_idx].base + offset;
1115         } else {
1116             addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1117         }
1118 
1119         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1120             ret = nvme_addr_read(n, addr, ptr, trans_len);
1121         } else {
1122             ret = nvme_addr_write(n, addr, ptr, trans_len);
1123         }
1124 
1125         if (ret) {
1126             return NVME_DATA_TRAS_ERROR;
1127         }
1128 
1129         ptr += trans_len;
1130         len -= trans_len;
1131         count -= trans_len;
1132         offset += trans_len;
1133 
1134         if (count == 0) {
1135             count = bytes;
1136             offset += skip_bytes;
1137         }
1138     }
1139 
1140     return NVME_SUCCESS;
1141 }
1142 
1143 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1144                         NvmeTxDirection dir)
1145 {
1146     assert(sg->flags & NVME_SG_ALLOC);
1147 
1148     if (sg->flags & NVME_SG_DMA) {
1149         const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1150         dma_addr_t residual;
1151 
1152         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1153             dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1154         } else {
1155             dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1156         }
1157 
1158         if (unlikely(residual)) {
1159             trace_pci_nvme_err_invalid_dma();
1160             return NVME_INVALID_FIELD | NVME_DNR;
1161         }
1162     } else {
1163         size_t bytes;
1164 
1165         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1166             bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1167         } else {
1168             bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1169         }
1170 
1171         if (unlikely(bytes != len)) {
1172             trace_pci_nvme_err_invalid_dma();
1173             return NVME_INVALID_FIELD | NVME_DNR;
1174         }
1175     }
1176 
1177     return NVME_SUCCESS;
1178 }
1179 
1180 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1181                                 NvmeRequest *req)
1182 {
1183     uint16_t status;
1184 
1185     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1186     if (status) {
1187         return status;
1188     }
1189 
1190     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1191 }
1192 
1193 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1194                                 NvmeRequest *req)
1195 {
1196     uint16_t status;
1197 
1198     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1199     if (status) {
1200         return status;
1201     }
1202 
1203     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1204 }
1205 
1206 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1207                           NvmeTxDirection dir, NvmeRequest *req)
1208 {
1209     NvmeNamespace *ns = req->ns;
1210     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1211     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1212     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1213 
1214     if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1215         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1216                                    ns->lbaf.ms, 0, dir);
1217     }
1218 
1219     return nvme_tx(n, &req->sg, ptr, len, dir);
1220 }
1221 
1222 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1223                            NvmeTxDirection dir, NvmeRequest *req)
1224 {
1225     NvmeNamespace *ns = req->ns;
1226     uint16_t status;
1227 
1228     if (nvme_ns_ext(ns)) {
1229         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1230                                    ns->lbasz, ns->lbasz, dir);
1231     }
1232 
1233     nvme_sg_unmap(&req->sg);
1234 
1235     status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1236     if (status) {
1237         return status;
1238     }
1239 
1240     return nvme_tx(n, &req->sg, ptr, len, dir);
1241 }
1242 
1243 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1244                                  BlockCompletionFunc *cb, NvmeRequest *req)
1245 {
1246     assert(req->sg.flags & NVME_SG_ALLOC);
1247 
1248     if (req->sg.flags & NVME_SG_DMA) {
1249         req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1250                                   cb, req);
1251     } else {
1252         req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1253     }
1254 }
1255 
1256 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1257                                   BlockCompletionFunc *cb, NvmeRequest *req)
1258 {
1259     assert(req->sg.flags & NVME_SG_ALLOC);
1260 
1261     if (req->sg.flags & NVME_SG_DMA) {
1262         req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1263                                    cb, req);
1264     } else {
1265         req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1266     }
1267 }
1268 
1269 static void nvme_post_cqes(void *opaque)
1270 {
1271     NvmeCQueue *cq = opaque;
1272     NvmeCtrl *n = cq->ctrl;
1273     NvmeRequest *req, *next;
1274     bool pending = cq->head != cq->tail;
1275     int ret;
1276 
1277     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1278         NvmeSQueue *sq;
1279         hwaddr addr;
1280 
1281         if (nvme_cq_full(cq)) {
1282             break;
1283         }
1284 
1285         sq = req->sq;
1286         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1287         req->cqe.sq_id = cpu_to_le16(sq->sqid);
1288         req->cqe.sq_head = cpu_to_le16(sq->head);
1289         addr = cq->dma_addr + cq->tail * n->cqe_size;
1290         ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1291                             sizeof(req->cqe));
1292         if (ret) {
1293             trace_pci_nvme_err_addr_write(addr);
1294             trace_pci_nvme_err_cfs();
1295             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1296             break;
1297         }
1298         QTAILQ_REMOVE(&cq->req_list, req, entry);
1299         nvme_inc_cq_tail(cq);
1300         nvme_sg_unmap(&req->sg);
1301         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1302     }
1303     if (cq->tail != cq->head) {
1304         if (cq->irq_enabled && !pending) {
1305             n->cq_pending++;
1306         }
1307 
1308         nvme_irq_assert(n, cq);
1309     }
1310 }
1311 
1312 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1313 {
1314     assert(cq->cqid == req->sq->cqid);
1315     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1316                                           le32_to_cpu(req->cqe.result),
1317                                           le32_to_cpu(req->cqe.dw1),
1318                                           req->status);
1319 
1320     if (req->status) {
1321         trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1322                                       req->status, req->cmd.opcode);
1323     }
1324 
1325     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1326     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1327     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1328 }
1329 
1330 static void nvme_process_aers(void *opaque)
1331 {
1332     NvmeCtrl *n = opaque;
1333     NvmeAsyncEvent *event, *next;
1334 
1335     trace_pci_nvme_process_aers(n->aer_queued);
1336 
1337     QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1338         NvmeRequest *req;
1339         NvmeAerResult *result;
1340 
1341         /* can't post cqe if there is nothing to complete */
1342         if (!n->outstanding_aers) {
1343             trace_pci_nvme_no_outstanding_aers();
1344             break;
1345         }
1346 
1347         /* ignore if masked (cqe posted, but event not cleared) */
1348         if (n->aer_mask & (1 << event->result.event_type)) {
1349             trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1350             continue;
1351         }
1352 
1353         QTAILQ_REMOVE(&n->aer_queue, event, entry);
1354         n->aer_queued--;
1355 
1356         n->aer_mask |= 1 << event->result.event_type;
1357         n->outstanding_aers--;
1358 
1359         req = n->aer_reqs[n->outstanding_aers];
1360 
1361         result = (NvmeAerResult *) &req->cqe.result;
1362         result->event_type = event->result.event_type;
1363         result->event_info = event->result.event_info;
1364         result->log_page = event->result.log_page;
1365         g_free(event);
1366 
1367         trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1368                                     result->log_page);
1369 
1370         nvme_enqueue_req_completion(&n->admin_cq, req);
1371     }
1372 }
1373 
1374 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1375                                uint8_t event_info, uint8_t log_page)
1376 {
1377     NvmeAsyncEvent *event;
1378 
1379     trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1380 
1381     if (n->aer_queued == n->params.aer_max_queued) {
1382         trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1383         return;
1384     }
1385 
1386     event = g_new(NvmeAsyncEvent, 1);
1387     event->result = (NvmeAerResult) {
1388         .event_type = event_type,
1389         .event_info = event_info,
1390         .log_page   = log_page,
1391     };
1392 
1393     QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1394     n->aer_queued++;
1395 
1396     nvme_process_aers(n);
1397 }
1398 
1399 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1400 {
1401     uint8_t aer_info;
1402 
1403     /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
1404     if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1405         return;
1406     }
1407 
1408     switch (event) {
1409     case NVME_SMART_SPARE:
1410         aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1411         break;
1412     case NVME_SMART_TEMPERATURE:
1413         aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1414         break;
1415     case NVME_SMART_RELIABILITY:
1416     case NVME_SMART_MEDIA_READ_ONLY:
1417     case NVME_SMART_FAILED_VOLATILE_MEDIA:
1418     case NVME_SMART_PMR_UNRELIABLE:
1419         aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1420         break;
1421     default:
1422         return;
1423     }
1424 
1425     nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1426 }
1427 
1428 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1429 {
1430     n->aer_mask &= ~(1 << event_type);
1431     if (!QTAILQ_EMPTY(&n->aer_queue)) {
1432         nvme_process_aers(n);
1433     }
1434 }
1435 
1436 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1437 {
1438     uint8_t mdts = n->params.mdts;
1439 
1440     if (mdts && len > n->page_size << mdts) {
1441         trace_pci_nvme_err_mdts(len);
1442         return NVME_INVALID_FIELD | NVME_DNR;
1443     }
1444 
1445     return NVME_SUCCESS;
1446 }
1447 
1448 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1449                                          uint32_t nlb)
1450 {
1451     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1452 
1453     if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1454         trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1455         return NVME_LBA_RANGE | NVME_DNR;
1456     }
1457 
1458     return NVME_SUCCESS;
1459 }
1460 
1461 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1462                                  uint32_t nlb, int flags)
1463 {
1464     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1465 
1466     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1467     int64_t offset = nvme_l2b(ns, slba);
1468     int ret;
1469 
1470     /*
1471      * `pnum` holds the number of bytes after offset that shares the same
1472      * allocation status as the byte at offset. If `pnum` is different from
1473      * `bytes`, we should check the allocation status of the next range and
1474      * continue this until all bytes have been checked.
1475      */
1476     do {
1477         bytes -= pnum;
1478 
1479         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1480         if (ret < 0) {
1481             return ret;
1482         }
1483 
1484 
1485         trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1486                                     !!(ret & BDRV_BLOCK_ZERO));
1487 
1488         if (!(ret & flags)) {
1489             return 1;
1490         }
1491 
1492         offset += pnum;
1493     } while (pnum != bytes);
1494 
1495     return 0;
1496 }
1497 
1498 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1499                                  uint32_t nlb)
1500 {
1501     int ret;
1502     Error *err = NULL;
1503 
1504     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1505     if (ret) {
1506         if (ret < 0) {
1507             error_setg_errno(&err, -ret, "unable to get block status");
1508             error_report_err(err);
1509 
1510             return NVME_INTERNAL_DEV_ERROR;
1511         }
1512 
1513         return NVME_DULB;
1514     }
1515 
1516     return NVME_SUCCESS;
1517 }
1518 
1519 static void nvme_aio_err(NvmeRequest *req, int ret)
1520 {
1521     uint16_t status = NVME_SUCCESS;
1522     Error *local_err = NULL;
1523 
1524     switch (req->cmd.opcode) {
1525     case NVME_CMD_READ:
1526         status = NVME_UNRECOVERED_READ;
1527         break;
1528     case NVME_CMD_FLUSH:
1529     case NVME_CMD_WRITE:
1530     case NVME_CMD_WRITE_ZEROES:
1531     case NVME_CMD_ZONE_APPEND:
1532         status = NVME_WRITE_FAULT;
1533         break;
1534     default:
1535         status = NVME_INTERNAL_DEV_ERROR;
1536         break;
1537     }
1538 
1539     trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1540 
1541     error_setg_errno(&local_err, -ret, "aio failed");
1542     error_report_err(local_err);
1543 
1544     /*
1545      * Set the command status code to the first encountered error but allow a
1546      * subsequent Internal Device Error to trump it.
1547      */
1548     if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1549         return;
1550     }
1551 
1552     req->status = status;
1553 }
1554 
1555 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1556 {
1557     return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1558                                     slba / ns->zone_size;
1559 }
1560 
1561 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1562 {
1563     uint32_t zone_idx = nvme_zone_idx(ns, slba);
1564 
1565     if (zone_idx >= ns->num_zones) {
1566         return NULL;
1567     }
1568 
1569     return &ns->zone_array[zone_idx];
1570 }
1571 
1572 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1573 {
1574     uint64_t zslba = zone->d.zslba;
1575 
1576     switch (nvme_get_zone_state(zone)) {
1577     case NVME_ZONE_STATE_EMPTY:
1578     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1579     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1580     case NVME_ZONE_STATE_CLOSED:
1581         return NVME_SUCCESS;
1582     case NVME_ZONE_STATE_FULL:
1583         trace_pci_nvme_err_zone_is_full(zslba);
1584         return NVME_ZONE_FULL;
1585     case NVME_ZONE_STATE_OFFLINE:
1586         trace_pci_nvme_err_zone_is_offline(zslba);
1587         return NVME_ZONE_OFFLINE;
1588     case NVME_ZONE_STATE_READ_ONLY:
1589         trace_pci_nvme_err_zone_is_read_only(zslba);
1590         return NVME_ZONE_READ_ONLY;
1591     default:
1592         assert(false);
1593     }
1594 
1595     return NVME_INTERNAL_DEV_ERROR;
1596 }
1597 
1598 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1599                                       uint64_t slba, uint32_t nlb)
1600 {
1601     uint64_t zcap = nvme_zone_wr_boundary(zone);
1602     uint16_t status;
1603 
1604     status = nvme_check_zone_state_for_write(zone);
1605     if (status) {
1606         return status;
1607     }
1608 
1609     if (unlikely(slba != zone->w_ptr)) {
1610         trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1611         return NVME_ZONE_INVALID_WRITE;
1612     }
1613 
1614     if (unlikely((slba + nlb) > zcap)) {
1615         trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1616         return NVME_ZONE_BOUNDARY_ERROR;
1617     }
1618 
1619     return NVME_SUCCESS;
1620 }
1621 
1622 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1623 {
1624     switch (nvme_get_zone_state(zone)) {
1625     case NVME_ZONE_STATE_EMPTY:
1626     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1627     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1628     case NVME_ZONE_STATE_FULL:
1629     case NVME_ZONE_STATE_CLOSED:
1630     case NVME_ZONE_STATE_READ_ONLY:
1631         return NVME_SUCCESS;
1632     case NVME_ZONE_STATE_OFFLINE:
1633         trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1634         return NVME_ZONE_OFFLINE;
1635     default:
1636         assert(false);
1637     }
1638 
1639     return NVME_INTERNAL_DEV_ERROR;
1640 }
1641 
1642 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1643                                      uint32_t nlb)
1644 {
1645     NvmeZone *zone;
1646     uint64_t bndry, end;
1647     uint16_t status;
1648 
1649     zone = nvme_get_zone_by_slba(ns, slba);
1650     assert(zone);
1651 
1652     bndry = nvme_zone_rd_boundary(ns, zone);
1653     end = slba + nlb;
1654 
1655     status = nvme_check_zone_state_for_read(zone);
1656     if (status) {
1657         ;
1658     } else if (unlikely(end > bndry)) {
1659         if (!ns->params.cross_zone_read) {
1660             status = NVME_ZONE_BOUNDARY_ERROR;
1661         } else {
1662             /*
1663              * Read across zone boundary - check that all subsequent
1664              * zones that are being read have an appropriate state.
1665              */
1666             do {
1667                 zone++;
1668                 status = nvme_check_zone_state_for_read(zone);
1669                 if (status) {
1670                     break;
1671                 }
1672             } while (end > nvme_zone_rd_boundary(ns, zone));
1673         }
1674     }
1675 
1676     return status;
1677 }
1678 
1679 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1680 {
1681     switch (nvme_get_zone_state(zone)) {
1682     case NVME_ZONE_STATE_FULL:
1683         return NVME_SUCCESS;
1684 
1685     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1686     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1687         nvme_aor_dec_open(ns);
1688         /* fallthrough */
1689     case NVME_ZONE_STATE_CLOSED:
1690         nvme_aor_dec_active(ns);
1691         /* fallthrough */
1692     case NVME_ZONE_STATE_EMPTY:
1693         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1694         return NVME_SUCCESS;
1695 
1696     default:
1697         return NVME_ZONE_INVAL_TRANSITION;
1698     }
1699 }
1700 
1701 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1702 {
1703     switch (nvme_get_zone_state(zone)) {
1704     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1705     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1706         nvme_aor_dec_open(ns);
1707         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1708         /* fall through */
1709     case NVME_ZONE_STATE_CLOSED:
1710         return NVME_SUCCESS;
1711 
1712     default:
1713         return NVME_ZONE_INVAL_TRANSITION;
1714     }
1715 }
1716 
1717 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1718 {
1719     switch (nvme_get_zone_state(zone)) {
1720     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1721     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1722         nvme_aor_dec_open(ns);
1723         /* fallthrough */
1724     case NVME_ZONE_STATE_CLOSED:
1725         nvme_aor_dec_active(ns);
1726         /* fallthrough */
1727     case NVME_ZONE_STATE_FULL:
1728         zone->w_ptr = zone->d.zslba;
1729         zone->d.wp = zone->w_ptr;
1730         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1731         /* fallthrough */
1732     case NVME_ZONE_STATE_EMPTY:
1733         return NVME_SUCCESS;
1734 
1735     default:
1736         return NVME_ZONE_INVAL_TRANSITION;
1737     }
1738 }
1739 
1740 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1741 {
1742     NvmeZone *zone;
1743 
1744     if (ns->params.max_open_zones &&
1745         ns->nr_open_zones == ns->params.max_open_zones) {
1746         zone = QTAILQ_FIRST(&ns->imp_open_zones);
1747         if (zone) {
1748             /*
1749              * Automatically close this implicitly open zone.
1750              */
1751             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1752             nvme_zrm_close(ns, zone);
1753         }
1754     }
1755 }
1756 
1757 enum {
1758     NVME_ZRM_AUTO = 1 << 0,
1759 };
1760 
1761 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1762                                     NvmeZone *zone, int flags)
1763 {
1764     int act = 0;
1765     uint16_t status;
1766 
1767     switch (nvme_get_zone_state(zone)) {
1768     case NVME_ZONE_STATE_EMPTY:
1769         act = 1;
1770 
1771         /* fallthrough */
1772 
1773     case NVME_ZONE_STATE_CLOSED:
1774         if (n->params.auto_transition_zones) {
1775             nvme_zrm_auto_transition_zone(ns);
1776         }
1777         status = nvme_aor_check(ns, act, 1);
1778         if (status) {
1779             return status;
1780         }
1781 
1782         if (act) {
1783             nvme_aor_inc_active(ns);
1784         }
1785 
1786         nvme_aor_inc_open(ns);
1787 
1788         if (flags & NVME_ZRM_AUTO) {
1789             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1790             return NVME_SUCCESS;
1791         }
1792 
1793         /* fallthrough */
1794 
1795     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1796         if (flags & NVME_ZRM_AUTO) {
1797             return NVME_SUCCESS;
1798         }
1799 
1800         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1801 
1802         /* fallthrough */
1803 
1804     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1805         return NVME_SUCCESS;
1806 
1807     default:
1808         return NVME_ZONE_INVAL_TRANSITION;
1809     }
1810 }
1811 
1812 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1813                                      NvmeZone *zone)
1814 {
1815     return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1816 }
1817 
1818 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
1819                                      NvmeZone *zone)
1820 {
1821     return nvme_zrm_open_flags(n, ns, zone, 0);
1822 }
1823 
1824 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1825                                  uint32_t nlb)
1826 {
1827     zone->d.wp += nlb;
1828 
1829     if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1830         nvme_zrm_finish(ns, zone);
1831     }
1832 }
1833 
1834 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1835 {
1836     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1837     NvmeZone *zone;
1838     uint64_t slba;
1839     uint32_t nlb;
1840 
1841     slba = le64_to_cpu(rw->slba);
1842     nlb = le16_to_cpu(rw->nlb) + 1;
1843     zone = nvme_get_zone_by_slba(ns, slba);
1844     assert(zone);
1845 
1846     nvme_advance_zone_wp(ns, zone, nlb);
1847 }
1848 
1849 static inline bool nvme_is_write(NvmeRequest *req)
1850 {
1851     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1852 
1853     return rw->opcode == NVME_CMD_WRITE ||
1854            rw->opcode == NVME_CMD_ZONE_APPEND ||
1855            rw->opcode == NVME_CMD_WRITE_ZEROES;
1856 }
1857 
1858 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1859 {
1860     return qemu_get_aio_context();
1861 }
1862 
1863 static void nvme_misc_cb(void *opaque, int ret)
1864 {
1865     NvmeRequest *req = opaque;
1866 
1867     trace_pci_nvme_misc_cb(nvme_cid(req));
1868 
1869     if (ret) {
1870         nvme_aio_err(req, ret);
1871     }
1872 
1873     nvme_enqueue_req_completion(nvme_cq(req), req);
1874 }
1875 
1876 void nvme_rw_complete_cb(void *opaque, int ret)
1877 {
1878     NvmeRequest *req = opaque;
1879     NvmeNamespace *ns = req->ns;
1880     BlockBackend *blk = ns->blkconf.blk;
1881     BlockAcctCookie *acct = &req->acct;
1882     BlockAcctStats *stats = blk_get_stats(blk);
1883 
1884     trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1885 
1886     if (ret) {
1887         block_acct_failed(stats, acct);
1888         nvme_aio_err(req, ret);
1889     } else {
1890         block_acct_done(stats, acct);
1891     }
1892 
1893     if (ns->params.zoned && nvme_is_write(req)) {
1894         nvme_finalize_zoned_write(ns, req);
1895     }
1896 
1897     nvme_enqueue_req_completion(nvme_cq(req), req);
1898 }
1899 
1900 static void nvme_rw_cb(void *opaque, int ret)
1901 {
1902     NvmeRequest *req = opaque;
1903     NvmeNamespace *ns = req->ns;
1904 
1905     BlockBackend *blk = ns->blkconf.blk;
1906 
1907     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1908 
1909     if (ret) {
1910         goto out;
1911     }
1912 
1913     if (ns->lbaf.ms) {
1914         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1915         uint64_t slba = le64_to_cpu(rw->slba);
1916         uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1917         uint64_t offset = nvme_moff(ns, slba);
1918 
1919         if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1920             size_t mlen = nvme_m2b(ns, nlb);
1921 
1922             req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1923                                                BDRV_REQ_MAY_UNMAP,
1924                                                nvme_rw_complete_cb, req);
1925             return;
1926         }
1927 
1928         if (nvme_ns_ext(ns) || req->cmd.mptr) {
1929             uint16_t status;
1930 
1931             nvme_sg_unmap(&req->sg);
1932             status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1933             if (status) {
1934                 ret = -EFAULT;
1935                 goto out;
1936             }
1937 
1938             if (req->cmd.opcode == NVME_CMD_READ) {
1939                 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1940             }
1941 
1942             return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1943         }
1944     }
1945 
1946 out:
1947     nvme_rw_complete_cb(req, ret);
1948 }
1949 
1950 static void nvme_verify_cb(void *opaque, int ret)
1951 {
1952     NvmeBounceContext *ctx = opaque;
1953     NvmeRequest *req = ctx->req;
1954     NvmeNamespace *ns = req->ns;
1955     BlockBackend *blk = ns->blkconf.blk;
1956     BlockAcctCookie *acct = &req->acct;
1957     BlockAcctStats *stats = blk_get_stats(blk);
1958     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1959     uint64_t slba = le64_to_cpu(rw->slba);
1960     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
1961     uint16_t apptag = le16_to_cpu(rw->apptag);
1962     uint16_t appmask = le16_to_cpu(rw->appmask);
1963     uint32_t reftag = le32_to_cpu(rw->reftag);
1964     uint16_t status;
1965 
1966     trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
1967 
1968     if (ret) {
1969         block_acct_failed(stats, acct);
1970         nvme_aio_err(req, ret);
1971         goto out;
1972     }
1973 
1974     block_acct_done(stats, acct);
1975 
1976     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1977         status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1978                                        ctx->mdata.iov.size, slba);
1979         if (status) {
1980             req->status = status;
1981             goto out;
1982         }
1983 
1984         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1985                                      ctx->mdata.bounce, ctx->mdata.iov.size,
1986                                      prinfo, slba, apptag, appmask, &reftag);
1987     }
1988 
1989 out:
1990     qemu_iovec_destroy(&ctx->data.iov);
1991     g_free(ctx->data.bounce);
1992 
1993     qemu_iovec_destroy(&ctx->mdata.iov);
1994     g_free(ctx->mdata.bounce);
1995 
1996     g_free(ctx);
1997 
1998     nvme_enqueue_req_completion(nvme_cq(req), req);
1999 }
2000 
2001 
2002 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2003 {
2004     NvmeBounceContext *ctx = opaque;
2005     NvmeRequest *req = ctx->req;
2006     NvmeNamespace *ns = req->ns;
2007     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2008     uint64_t slba = le64_to_cpu(rw->slba);
2009     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2010     size_t mlen = nvme_m2b(ns, nlb);
2011     uint64_t offset = nvme_moff(ns, slba);
2012     BlockBackend *blk = ns->blkconf.blk;
2013 
2014     trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2015 
2016     if (ret) {
2017         goto out;
2018     }
2019 
2020     ctx->mdata.bounce = g_malloc(mlen);
2021 
2022     qemu_iovec_reset(&ctx->mdata.iov);
2023     qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2024 
2025     req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2026                                 nvme_verify_cb, ctx);
2027     return;
2028 
2029 out:
2030     nvme_verify_cb(ctx, ret);
2031 }
2032 
2033 struct nvme_compare_ctx {
2034     struct {
2035         QEMUIOVector iov;
2036         uint8_t *bounce;
2037     } data;
2038 
2039     struct {
2040         QEMUIOVector iov;
2041         uint8_t *bounce;
2042     } mdata;
2043 };
2044 
2045 static void nvme_compare_mdata_cb(void *opaque, int ret)
2046 {
2047     NvmeRequest *req = opaque;
2048     NvmeNamespace *ns = req->ns;
2049     NvmeCtrl *n = nvme_ctrl(req);
2050     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2051     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2052     uint16_t apptag = le16_to_cpu(rw->apptag);
2053     uint16_t appmask = le16_to_cpu(rw->appmask);
2054     uint32_t reftag = le32_to_cpu(rw->reftag);
2055     struct nvme_compare_ctx *ctx = req->opaque;
2056     g_autofree uint8_t *buf = NULL;
2057     BlockBackend *blk = ns->blkconf.blk;
2058     BlockAcctCookie *acct = &req->acct;
2059     BlockAcctStats *stats = blk_get_stats(blk);
2060     uint16_t status = NVME_SUCCESS;
2061 
2062     trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2063 
2064     if (ret) {
2065         block_acct_failed(stats, acct);
2066         nvme_aio_err(req, ret);
2067         goto out;
2068     }
2069 
2070     buf = g_malloc(ctx->mdata.iov.size);
2071 
2072     status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2073                                NVME_TX_DIRECTION_TO_DEVICE, req);
2074     if (status) {
2075         req->status = status;
2076         goto out;
2077     }
2078 
2079     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2080         uint64_t slba = le64_to_cpu(rw->slba);
2081         uint8_t *bufp;
2082         uint8_t *mbufp = ctx->mdata.bounce;
2083         uint8_t *end = mbufp + ctx->mdata.iov.size;
2084         int16_t pil = 0;
2085 
2086         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2087                                 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2088                                 slba, apptag, appmask, &reftag);
2089         if (status) {
2090             req->status = status;
2091             goto out;
2092         }
2093 
2094         /*
2095          * When formatted with protection information, do not compare the DIF
2096          * tuple.
2097          */
2098         if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2099             pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2100         }
2101 
2102         for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2103             if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2104                 req->status = NVME_CMP_FAILURE;
2105                 goto out;
2106             }
2107         }
2108 
2109         goto out;
2110     }
2111 
2112     if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2113         req->status = NVME_CMP_FAILURE;
2114         goto out;
2115     }
2116 
2117     block_acct_done(stats, acct);
2118 
2119 out:
2120     qemu_iovec_destroy(&ctx->data.iov);
2121     g_free(ctx->data.bounce);
2122 
2123     qemu_iovec_destroy(&ctx->mdata.iov);
2124     g_free(ctx->mdata.bounce);
2125 
2126     g_free(ctx);
2127 
2128     nvme_enqueue_req_completion(nvme_cq(req), req);
2129 }
2130 
2131 static void nvme_compare_data_cb(void *opaque, int ret)
2132 {
2133     NvmeRequest *req = opaque;
2134     NvmeCtrl *n = nvme_ctrl(req);
2135     NvmeNamespace *ns = req->ns;
2136     BlockBackend *blk = ns->blkconf.blk;
2137     BlockAcctCookie *acct = &req->acct;
2138     BlockAcctStats *stats = blk_get_stats(blk);
2139 
2140     struct nvme_compare_ctx *ctx = req->opaque;
2141     g_autofree uint8_t *buf = NULL;
2142     uint16_t status;
2143 
2144     trace_pci_nvme_compare_data_cb(nvme_cid(req));
2145 
2146     if (ret) {
2147         block_acct_failed(stats, acct);
2148         nvme_aio_err(req, ret);
2149         goto out;
2150     }
2151 
2152     buf = g_malloc(ctx->data.iov.size);
2153 
2154     status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2155                               NVME_TX_DIRECTION_TO_DEVICE, req);
2156     if (status) {
2157         req->status = status;
2158         goto out;
2159     }
2160 
2161     if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2162         req->status = NVME_CMP_FAILURE;
2163         goto out;
2164     }
2165 
2166     if (ns->lbaf.ms) {
2167         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2168         uint64_t slba = le64_to_cpu(rw->slba);
2169         uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2170         size_t mlen = nvme_m2b(ns, nlb);
2171         uint64_t offset = nvme_moff(ns, slba);
2172 
2173         ctx->mdata.bounce = g_malloc(mlen);
2174 
2175         qemu_iovec_init(&ctx->mdata.iov, 1);
2176         qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2177 
2178         req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2179                                     nvme_compare_mdata_cb, req);
2180         return;
2181     }
2182 
2183     block_acct_done(stats, acct);
2184 
2185 out:
2186     qemu_iovec_destroy(&ctx->data.iov);
2187     g_free(ctx->data.bounce);
2188     g_free(ctx);
2189 
2190     nvme_enqueue_req_completion(nvme_cq(req), req);
2191 }
2192 
2193 typedef struct NvmeDSMAIOCB {
2194     BlockAIOCB common;
2195     BlockAIOCB *aiocb;
2196     NvmeRequest *req;
2197     QEMUBH *bh;
2198     int ret;
2199 
2200     NvmeDsmRange *range;
2201     unsigned int nr;
2202     unsigned int idx;
2203 } NvmeDSMAIOCB;
2204 
2205 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2206 {
2207     NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2208 
2209     /* break nvme_dsm_cb loop */
2210     iocb->idx = iocb->nr;
2211     iocb->ret = -ECANCELED;
2212 
2213     if (iocb->aiocb) {
2214         blk_aio_cancel_async(iocb->aiocb);
2215         iocb->aiocb = NULL;
2216     } else {
2217         /*
2218          * We only reach this if nvme_dsm_cancel() has already been called or
2219          * the command ran to completion and nvme_dsm_bh is scheduled to run.
2220          */
2221         assert(iocb->idx == iocb->nr);
2222     }
2223 }
2224 
2225 static const AIOCBInfo nvme_dsm_aiocb_info = {
2226     .aiocb_size   = sizeof(NvmeDSMAIOCB),
2227     .cancel_async = nvme_dsm_cancel,
2228 };
2229 
2230 static void nvme_dsm_bh(void *opaque)
2231 {
2232     NvmeDSMAIOCB *iocb = opaque;
2233 
2234     iocb->common.cb(iocb->common.opaque, iocb->ret);
2235 
2236     qemu_bh_delete(iocb->bh);
2237     iocb->bh = NULL;
2238     qemu_aio_unref(iocb);
2239 }
2240 
2241 static void nvme_dsm_cb(void *opaque, int ret);
2242 
2243 static void nvme_dsm_md_cb(void *opaque, int ret)
2244 {
2245     NvmeDSMAIOCB *iocb = opaque;
2246     NvmeRequest *req = iocb->req;
2247     NvmeNamespace *ns = req->ns;
2248     NvmeDsmRange *range;
2249     uint64_t slba;
2250     uint32_t nlb;
2251 
2252     if (ret < 0) {
2253         iocb->ret = ret;
2254         goto done;
2255     }
2256 
2257     if (!ns->lbaf.ms) {
2258         nvme_dsm_cb(iocb, 0);
2259         return;
2260     }
2261 
2262     range = &iocb->range[iocb->idx - 1];
2263     slba = le64_to_cpu(range->slba);
2264     nlb = le32_to_cpu(range->nlb);
2265 
2266     /*
2267      * Check that all block were discarded (zeroed); otherwise we do not zero
2268      * the metadata.
2269      */
2270 
2271     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2272     if (ret) {
2273         if (ret < 0) {
2274             iocb->ret = ret;
2275             goto done;
2276         }
2277 
2278         nvme_dsm_cb(iocb, 0);
2279     }
2280 
2281     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2282                                         nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2283                                         nvme_dsm_cb, iocb);
2284     return;
2285 
2286 done:
2287     iocb->aiocb = NULL;
2288     qemu_bh_schedule(iocb->bh);
2289 }
2290 
2291 static void nvme_dsm_cb(void *opaque, int ret)
2292 {
2293     NvmeDSMAIOCB *iocb = opaque;
2294     NvmeRequest *req = iocb->req;
2295     NvmeCtrl *n = nvme_ctrl(req);
2296     NvmeNamespace *ns = req->ns;
2297     NvmeDsmRange *range;
2298     uint64_t slba;
2299     uint32_t nlb;
2300 
2301     if (ret < 0) {
2302         iocb->ret = ret;
2303         goto done;
2304     }
2305 
2306 next:
2307     if (iocb->idx == iocb->nr) {
2308         goto done;
2309     }
2310 
2311     range = &iocb->range[iocb->idx++];
2312     slba = le64_to_cpu(range->slba);
2313     nlb = le32_to_cpu(range->nlb);
2314 
2315     trace_pci_nvme_dsm_deallocate(slba, nlb);
2316 
2317     if (nlb > n->dmrsl) {
2318         trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2319         goto next;
2320     }
2321 
2322     if (nvme_check_bounds(ns, slba, nlb)) {
2323         trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2324                                              ns->id_ns.nsze);
2325         goto next;
2326     }
2327 
2328     iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2329                                    nvme_l2b(ns, nlb),
2330                                    nvme_dsm_md_cb, iocb);
2331     return;
2332 
2333 done:
2334     iocb->aiocb = NULL;
2335     qemu_bh_schedule(iocb->bh);
2336 }
2337 
2338 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2339 {
2340     NvmeNamespace *ns = req->ns;
2341     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2342     uint32_t attr = le32_to_cpu(dsm->attributes);
2343     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2344     uint16_t status = NVME_SUCCESS;
2345 
2346     trace_pci_nvme_dsm(nr, attr);
2347 
2348     if (attr & NVME_DSMGMT_AD) {
2349         NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2350                                          nvme_misc_cb, req);
2351 
2352         iocb->req = req;
2353         iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2354         iocb->ret = 0;
2355         iocb->range = g_new(NvmeDsmRange, nr);
2356         iocb->nr = nr;
2357         iocb->idx = 0;
2358 
2359         status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2360                           req);
2361         if (status) {
2362             return status;
2363         }
2364 
2365         req->aiocb = &iocb->common;
2366         nvme_dsm_cb(iocb, 0);
2367 
2368         return NVME_NO_COMPLETE;
2369     }
2370 
2371     return status;
2372 }
2373 
2374 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2375 {
2376     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2377     NvmeNamespace *ns = req->ns;
2378     BlockBackend *blk = ns->blkconf.blk;
2379     uint64_t slba = le64_to_cpu(rw->slba);
2380     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2381     size_t len = nvme_l2b(ns, nlb);
2382     int64_t offset = nvme_l2b(ns, slba);
2383     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2384     uint32_t reftag = le32_to_cpu(rw->reftag);
2385     NvmeBounceContext *ctx = NULL;
2386     uint16_t status;
2387 
2388     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2389 
2390     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2391         status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2392         if (status) {
2393             return status;
2394         }
2395 
2396         if (prinfo & NVME_PRINFO_PRACT) {
2397             return NVME_INVALID_PROT_INFO | NVME_DNR;
2398         }
2399     }
2400 
2401     if (len > n->page_size << n->params.vsl) {
2402         return NVME_INVALID_FIELD | NVME_DNR;
2403     }
2404 
2405     status = nvme_check_bounds(ns, slba, nlb);
2406     if (status) {
2407         return status;
2408     }
2409 
2410     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2411         status = nvme_check_dulbe(ns, slba, nlb);
2412         if (status) {
2413             return status;
2414         }
2415     }
2416 
2417     ctx = g_new0(NvmeBounceContext, 1);
2418     ctx->req = req;
2419 
2420     ctx->data.bounce = g_malloc(len);
2421 
2422     qemu_iovec_init(&ctx->data.iov, 1);
2423     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2424 
2425     block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2426                      BLOCK_ACCT_READ);
2427 
2428     req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2429                                 nvme_verify_mdata_in_cb, ctx);
2430     return NVME_NO_COMPLETE;
2431 }
2432 
2433 typedef struct NvmeCopyAIOCB {
2434     BlockAIOCB common;
2435     BlockAIOCB *aiocb;
2436     NvmeRequest *req;
2437     QEMUBH *bh;
2438     int ret;
2439 
2440     NvmeCopySourceRange *ranges;
2441     int nr;
2442     int idx;
2443 
2444     uint8_t *bounce;
2445     QEMUIOVector iov;
2446     struct {
2447         BlockAcctCookie read;
2448         BlockAcctCookie write;
2449     } acct;
2450 
2451     uint32_t reftag;
2452     uint64_t slba;
2453 
2454     NvmeZone *zone;
2455 } NvmeCopyAIOCB;
2456 
2457 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2458 {
2459     NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2460 
2461     iocb->ret = -ECANCELED;
2462 
2463     if (iocb->aiocb) {
2464         blk_aio_cancel_async(iocb->aiocb);
2465         iocb->aiocb = NULL;
2466     }
2467 }
2468 
2469 static const AIOCBInfo nvme_copy_aiocb_info = {
2470     .aiocb_size   = sizeof(NvmeCopyAIOCB),
2471     .cancel_async = nvme_copy_cancel,
2472 };
2473 
2474 static void nvme_copy_bh(void *opaque)
2475 {
2476     NvmeCopyAIOCB *iocb = opaque;
2477     NvmeRequest *req = iocb->req;
2478     NvmeNamespace *ns = req->ns;
2479     BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2480 
2481     if (iocb->idx != iocb->nr) {
2482         req->cqe.result = cpu_to_le32(iocb->idx);
2483     }
2484 
2485     qemu_iovec_destroy(&iocb->iov);
2486     g_free(iocb->bounce);
2487 
2488     qemu_bh_delete(iocb->bh);
2489     iocb->bh = NULL;
2490 
2491     if (iocb->ret < 0) {
2492         block_acct_failed(stats, &iocb->acct.read);
2493         block_acct_failed(stats, &iocb->acct.write);
2494     } else {
2495         block_acct_done(stats, &iocb->acct.read);
2496         block_acct_done(stats, &iocb->acct.write);
2497     }
2498 
2499     iocb->common.cb(iocb->common.opaque, iocb->ret);
2500     qemu_aio_unref(iocb);
2501 }
2502 
2503 static void nvme_copy_cb(void *opaque, int ret);
2504 
2505 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2506 {
2507     NvmeCopyAIOCB *iocb = opaque;
2508     NvmeRequest *req = iocb->req;
2509     NvmeNamespace *ns = req->ns;
2510     NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
2511     uint32_t nlb = le32_to_cpu(range->nlb) + 1;
2512 
2513     if (ret < 0) {
2514         iocb->ret = ret;
2515         goto out;
2516     } else if (iocb->ret < 0) {
2517         goto out;
2518     }
2519 
2520     if (ns->params.zoned) {
2521         nvme_advance_zone_wp(ns, iocb->zone, nlb);
2522     }
2523 
2524     iocb->idx++;
2525     iocb->slba += nlb;
2526 out:
2527     nvme_copy_cb(iocb, iocb->ret);
2528 }
2529 
2530 static void nvme_copy_out_cb(void *opaque, int ret)
2531 {
2532     NvmeCopyAIOCB *iocb = opaque;
2533     NvmeRequest *req = iocb->req;
2534     NvmeNamespace *ns = req->ns;
2535     NvmeCopySourceRange *range;
2536     uint32_t nlb;
2537     size_t mlen;
2538     uint8_t *mbounce;
2539 
2540     if (ret < 0) {
2541         iocb->ret = ret;
2542         goto out;
2543     } else if (iocb->ret < 0) {
2544         goto out;
2545     }
2546 
2547     if (!ns->lbaf.ms) {
2548         nvme_copy_out_completed_cb(iocb, 0);
2549         return;
2550     }
2551 
2552     range = &iocb->ranges[iocb->idx];
2553     nlb = le32_to_cpu(range->nlb) + 1;
2554 
2555     mlen = nvme_m2b(ns, nlb);
2556     mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2557 
2558     qemu_iovec_reset(&iocb->iov);
2559     qemu_iovec_add(&iocb->iov, mbounce, mlen);
2560 
2561     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2562                                   &iocb->iov, 0, nvme_copy_out_completed_cb,
2563                                   iocb);
2564 
2565     return;
2566 
2567 out:
2568     nvme_copy_cb(iocb, ret);
2569 }
2570 
2571 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2572 {
2573     NvmeCopyAIOCB *iocb = opaque;
2574     NvmeRequest *req = iocb->req;
2575     NvmeNamespace *ns = req->ns;
2576     NvmeCopySourceRange *range;
2577     uint32_t nlb;
2578     size_t len;
2579     uint16_t status;
2580 
2581     if (ret < 0) {
2582         iocb->ret = ret;
2583         goto out;
2584     } else if (iocb->ret < 0) {
2585         goto out;
2586     }
2587 
2588     range = &iocb->ranges[iocb->idx];
2589     nlb = le32_to_cpu(range->nlb) + 1;
2590     len = nvme_l2b(ns, nlb);
2591 
2592     trace_pci_nvme_copy_out(iocb->slba, nlb);
2593 
2594     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2595         NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2596 
2597         uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2598         uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2599 
2600         uint16_t apptag = le16_to_cpu(range->apptag);
2601         uint16_t appmask = le16_to_cpu(range->appmask);
2602         uint32_t reftag = le32_to_cpu(range->reftag);
2603 
2604         uint64_t slba = le64_to_cpu(range->slba);
2605         size_t mlen = nvme_m2b(ns, nlb);
2606         uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2607 
2608         status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2609                                 slba, apptag, appmask, &reftag);
2610         if (status) {
2611             goto invalid;
2612         }
2613 
2614         apptag = le16_to_cpu(copy->apptag);
2615         appmask = le16_to_cpu(copy->appmask);
2616 
2617         if (prinfow & NVME_PRINFO_PRACT) {
2618             status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2619             if (status) {
2620                 goto invalid;
2621             }
2622 
2623             nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2624                                         apptag, &iocb->reftag);
2625         } else {
2626             status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2627                                     prinfow, iocb->slba, apptag, appmask,
2628                                     &iocb->reftag);
2629             if (status) {
2630                 goto invalid;
2631             }
2632         }
2633     }
2634 
2635     status = nvme_check_bounds(ns, iocb->slba, nlb);
2636     if (status) {
2637         goto invalid;
2638     }
2639 
2640     if (ns->params.zoned) {
2641         status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2642         if (status) {
2643             goto invalid;
2644         }
2645 
2646         iocb->zone->w_ptr += nlb;
2647     }
2648 
2649     qemu_iovec_reset(&iocb->iov);
2650     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2651 
2652     iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2653                                   &iocb->iov, 0, nvme_copy_out_cb, iocb);
2654 
2655     return;
2656 
2657 invalid:
2658     req->status = status;
2659     iocb->aiocb = NULL;
2660     if (iocb->bh) {
2661         qemu_bh_schedule(iocb->bh);
2662     }
2663 
2664     return;
2665 
2666 out:
2667     nvme_copy_cb(iocb, ret);
2668 }
2669 
2670 static void nvme_copy_in_cb(void *opaque, int ret)
2671 {
2672     NvmeCopyAIOCB *iocb = opaque;
2673     NvmeRequest *req = iocb->req;
2674     NvmeNamespace *ns = req->ns;
2675     NvmeCopySourceRange *range;
2676     uint64_t slba;
2677     uint32_t nlb;
2678 
2679     if (ret < 0) {
2680         iocb->ret = ret;
2681         goto out;
2682     } else if (iocb->ret < 0) {
2683         goto out;
2684     }
2685 
2686     if (!ns->lbaf.ms) {
2687         nvme_copy_in_completed_cb(iocb, 0);
2688         return;
2689     }
2690 
2691     range = &iocb->ranges[iocb->idx];
2692     slba = le64_to_cpu(range->slba);
2693     nlb = le32_to_cpu(range->nlb) + 1;
2694 
2695     qemu_iovec_reset(&iocb->iov);
2696     qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2697                    nvme_m2b(ns, nlb));
2698 
2699     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2700                                  &iocb->iov, 0, nvme_copy_in_completed_cb,
2701                                  iocb);
2702     return;
2703 
2704 out:
2705     nvme_copy_cb(iocb, iocb->ret);
2706 }
2707 
2708 static void nvme_copy_cb(void *opaque, int ret)
2709 {
2710     NvmeCopyAIOCB *iocb = opaque;
2711     NvmeRequest *req = iocb->req;
2712     NvmeNamespace *ns = req->ns;
2713     NvmeCopySourceRange *range;
2714     uint64_t slba;
2715     uint32_t nlb;
2716     size_t len;
2717     uint16_t status;
2718 
2719     if (ret < 0) {
2720         iocb->ret = ret;
2721         goto done;
2722     } else if (iocb->ret < 0) {
2723         goto done;
2724     }
2725 
2726     if (iocb->idx == iocb->nr) {
2727         goto done;
2728     }
2729 
2730     range = &iocb->ranges[iocb->idx];
2731     slba = le64_to_cpu(range->slba);
2732     nlb = le32_to_cpu(range->nlb) + 1;
2733     len = nvme_l2b(ns, nlb);
2734 
2735     trace_pci_nvme_copy_source_range(slba, nlb);
2736 
2737     if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2738         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2739         goto invalid;
2740     }
2741 
2742     status = nvme_check_bounds(ns, slba, nlb);
2743     if (status) {
2744         goto invalid;
2745     }
2746 
2747     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2748         status = nvme_check_dulbe(ns, slba, nlb);
2749         if (status) {
2750             goto invalid;
2751         }
2752     }
2753 
2754     if (ns->params.zoned) {
2755         status = nvme_check_zone_read(ns, slba, nlb);
2756         if (status) {
2757             goto invalid;
2758         }
2759     }
2760 
2761     qemu_iovec_reset(&iocb->iov);
2762     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2763 
2764     iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2765                                  &iocb->iov, 0, nvme_copy_in_cb, iocb);
2766     return;
2767 
2768 invalid:
2769     req->status = status;
2770 done:
2771     iocb->aiocb = NULL;
2772     if (iocb->bh) {
2773         qemu_bh_schedule(iocb->bh);
2774     }
2775 }
2776 
2777 
2778 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2779 {
2780     NvmeNamespace *ns = req->ns;
2781     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2782     NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2783                                       nvme_misc_cb, req);
2784     uint16_t nr = copy->nr + 1;
2785     uint8_t format = copy->control[0] & 0xf;
2786     uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2787     uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2788 
2789     uint16_t status;
2790 
2791     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2792 
2793     iocb->ranges = NULL;
2794     iocb->zone = NULL;
2795 
2796     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2797         ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2798         status = NVME_INVALID_FIELD | NVME_DNR;
2799         goto invalid;
2800     }
2801 
2802     if (!(n->id_ctrl.ocfs & (1 << format))) {
2803         trace_pci_nvme_err_copy_invalid_format(format);
2804         status = NVME_INVALID_FIELD | NVME_DNR;
2805         goto invalid;
2806     }
2807 
2808     if (nr > ns->id_ns.msrc + 1) {
2809         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2810         goto invalid;
2811     }
2812 
2813     iocb->ranges = g_new(NvmeCopySourceRange, nr);
2814 
2815     status = nvme_h2c(n, (uint8_t *)iocb->ranges,
2816                       sizeof(NvmeCopySourceRange) * nr, req);
2817     if (status) {
2818         goto invalid;
2819     }
2820 
2821     iocb->slba = le64_to_cpu(copy->sdlba);
2822 
2823     if (ns->params.zoned) {
2824         iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
2825         if (!iocb->zone) {
2826             status = NVME_LBA_RANGE | NVME_DNR;
2827             goto invalid;
2828         }
2829 
2830         status = nvme_zrm_auto(n, ns, iocb->zone);
2831         if (status) {
2832             goto invalid;
2833         }
2834     }
2835 
2836     iocb->req = req;
2837     iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
2838     iocb->ret = 0;
2839     iocb->nr = nr;
2840     iocb->idx = 0;
2841     iocb->reftag = le32_to_cpu(copy->reftag);
2842     iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
2843                               ns->lbasz + ns->lbaf.ms);
2844 
2845     qemu_iovec_init(&iocb->iov, 1);
2846 
2847     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
2848                      BLOCK_ACCT_READ);
2849     block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
2850                      BLOCK_ACCT_WRITE);
2851 
2852     req->aiocb = &iocb->common;
2853     nvme_copy_cb(iocb, 0);
2854 
2855     return NVME_NO_COMPLETE;
2856 
2857 invalid:
2858     g_free(iocb->ranges);
2859     qemu_aio_unref(iocb);
2860     return status;
2861 }
2862 
2863 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2864 {
2865     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2866     NvmeNamespace *ns = req->ns;
2867     BlockBackend *blk = ns->blkconf.blk;
2868     uint64_t slba = le64_to_cpu(rw->slba);
2869     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2870     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2871     size_t data_len = nvme_l2b(ns, nlb);
2872     size_t len = data_len;
2873     int64_t offset = nvme_l2b(ns, slba);
2874     struct nvme_compare_ctx *ctx = NULL;
2875     uint16_t status;
2876 
2877     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2878 
2879     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
2880         return NVME_INVALID_PROT_INFO | NVME_DNR;
2881     }
2882 
2883     if (nvme_ns_ext(ns)) {
2884         len += nvme_m2b(ns, nlb);
2885     }
2886 
2887     status = nvme_check_mdts(n, len);
2888     if (status) {
2889         return status;
2890     }
2891 
2892     status = nvme_check_bounds(ns, slba, nlb);
2893     if (status) {
2894         return status;
2895     }
2896 
2897     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2898         status = nvme_check_dulbe(ns, slba, nlb);
2899         if (status) {
2900             return status;
2901         }
2902     }
2903 
2904     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2905     if (status) {
2906         return status;
2907     }
2908 
2909     ctx = g_new(struct nvme_compare_ctx, 1);
2910     ctx->data.bounce = g_malloc(data_len);
2911 
2912     req->opaque = ctx;
2913 
2914     qemu_iovec_init(&ctx->data.iov, 1);
2915     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2916 
2917     block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2918                      BLOCK_ACCT_READ);
2919     req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2920                                 nvme_compare_data_cb, req);
2921 
2922     return NVME_NO_COMPLETE;
2923 }
2924 
2925 typedef struct NvmeFlushAIOCB {
2926     BlockAIOCB common;
2927     BlockAIOCB *aiocb;
2928     NvmeRequest *req;
2929     QEMUBH *bh;
2930     int ret;
2931 
2932     NvmeNamespace *ns;
2933     uint32_t nsid;
2934     bool broadcast;
2935 } NvmeFlushAIOCB;
2936 
2937 static void nvme_flush_cancel(BlockAIOCB *acb)
2938 {
2939     NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
2940 
2941     iocb->ret = -ECANCELED;
2942 
2943     if (iocb->aiocb) {
2944         blk_aio_cancel_async(iocb->aiocb);
2945     }
2946 }
2947 
2948 static const AIOCBInfo nvme_flush_aiocb_info = {
2949     .aiocb_size = sizeof(NvmeFlushAIOCB),
2950     .cancel_async = nvme_flush_cancel,
2951     .get_aio_context = nvme_get_aio_context,
2952 };
2953 
2954 static void nvme_flush_ns_cb(void *opaque, int ret)
2955 {
2956     NvmeFlushAIOCB *iocb = opaque;
2957     NvmeNamespace *ns = iocb->ns;
2958 
2959     if (ret < 0) {
2960         iocb->ret = ret;
2961         goto out;
2962     } else if (iocb->ret < 0) {
2963         goto out;
2964     }
2965 
2966     if (ns) {
2967         trace_pci_nvme_flush_ns(iocb->nsid);
2968 
2969         iocb->ns = NULL;
2970         iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
2971         return;
2972     }
2973 
2974 out:
2975     iocb->aiocb = NULL;
2976     qemu_bh_schedule(iocb->bh);
2977 }
2978 
2979 static void nvme_flush_bh(void *opaque)
2980 {
2981     NvmeFlushAIOCB *iocb = opaque;
2982     NvmeRequest *req = iocb->req;
2983     NvmeCtrl *n = nvme_ctrl(req);
2984     int i;
2985 
2986     if (iocb->ret < 0) {
2987         goto done;
2988     }
2989 
2990     if (iocb->broadcast) {
2991         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
2992             iocb->ns = nvme_ns(n, i);
2993             if (iocb->ns) {
2994                 iocb->nsid = i;
2995                 break;
2996             }
2997         }
2998     }
2999 
3000     if (!iocb->ns) {
3001         goto done;
3002     }
3003 
3004     nvme_flush_ns_cb(iocb, 0);
3005     return;
3006 
3007 done:
3008     qemu_bh_delete(iocb->bh);
3009     iocb->bh = NULL;
3010 
3011     iocb->common.cb(iocb->common.opaque, iocb->ret);
3012 
3013     qemu_aio_unref(iocb);
3014 
3015     return;
3016 }
3017 
3018 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3019 {
3020     NvmeFlushAIOCB *iocb;
3021     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3022     uint16_t status;
3023 
3024     iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3025 
3026     iocb->req = req;
3027     iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3028     iocb->ret = 0;
3029     iocb->ns = NULL;
3030     iocb->nsid = 0;
3031     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3032 
3033     if (!iocb->broadcast) {
3034         if (!nvme_nsid_valid(n, nsid)) {
3035             status = NVME_INVALID_NSID | NVME_DNR;
3036             goto out;
3037         }
3038 
3039         iocb->ns = nvme_ns(n, nsid);
3040         if (!iocb->ns) {
3041             status = NVME_INVALID_FIELD | NVME_DNR;
3042             goto out;
3043         }
3044 
3045         iocb->nsid = nsid;
3046     }
3047 
3048     req->aiocb = &iocb->common;
3049     qemu_bh_schedule(iocb->bh);
3050 
3051     return NVME_NO_COMPLETE;
3052 
3053 out:
3054     qemu_bh_delete(iocb->bh);
3055     iocb->bh = NULL;
3056     qemu_aio_unref(iocb);
3057 
3058     return status;
3059 }
3060 
3061 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3062 {
3063     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3064     NvmeNamespace *ns = req->ns;
3065     uint64_t slba = le64_to_cpu(rw->slba);
3066     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3067     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3068     uint64_t data_size = nvme_l2b(ns, nlb);
3069     uint64_t mapped_size = data_size;
3070     uint64_t data_offset;
3071     BlockBackend *blk = ns->blkconf.blk;
3072     uint16_t status;
3073 
3074     if (nvme_ns_ext(ns)) {
3075         mapped_size += nvme_m2b(ns, nlb);
3076 
3077         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3078             bool pract = prinfo & NVME_PRINFO_PRACT;
3079 
3080             if (pract && ns->lbaf.ms == 8) {
3081                 mapped_size = data_size;
3082             }
3083         }
3084     }
3085 
3086     trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3087 
3088     status = nvme_check_mdts(n, mapped_size);
3089     if (status) {
3090         goto invalid;
3091     }
3092 
3093     status = nvme_check_bounds(ns, slba, nlb);
3094     if (status) {
3095         goto invalid;
3096     }
3097 
3098     if (ns->params.zoned) {
3099         status = nvme_check_zone_read(ns, slba, nlb);
3100         if (status) {
3101             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3102             goto invalid;
3103         }
3104     }
3105 
3106     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3107         status = nvme_check_dulbe(ns, slba, nlb);
3108         if (status) {
3109             goto invalid;
3110         }
3111     }
3112 
3113     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3114         return nvme_dif_rw(n, req);
3115     }
3116 
3117     status = nvme_map_data(n, nlb, req);
3118     if (status) {
3119         goto invalid;
3120     }
3121 
3122     data_offset = nvme_l2b(ns, slba);
3123 
3124     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3125                      BLOCK_ACCT_READ);
3126     nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3127     return NVME_NO_COMPLETE;
3128 
3129 invalid:
3130     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3131     return status | NVME_DNR;
3132 }
3133 
3134 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3135                               bool wrz)
3136 {
3137     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3138     NvmeNamespace *ns = req->ns;
3139     uint64_t slba = le64_to_cpu(rw->slba);
3140     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3141     uint16_t ctrl = le16_to_cpu(rw->control);
3142     uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3143     uint64_t data_size = nvme_l2b(ns, nlb);
3144     uint64_t mapped_size = data_size;
3145     uint64_t data_offset;
3146     NvmeZone *zone;
3147     NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3148     BlockBackend *blk = ns->blkconf.blk;
3149     uint16_t status;
3150 
3151     if (nvme_ns_ext(ns)) {
3152         mapped_size += nvme_m2b(ns, nlb);
3153 
3154         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3155             bool pract = prinfo & NVME_PRINFO_PRACT;
3156 
3157             if (pract && ns->lbaf.ms == 8) {
3158                 mapped_size -= nvme_m2b(ns, nlb);
3159             }
3160         }
3161     }
3162 
3163     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3164                          nvme_nsid(ns), nlb, mapped_size, slba);
3165 
3166     if (!wrz) {
3167         status = nvme_check_mdts(n, mapped_size);
3168         if (status) {
3169             goto invalid;
3170         }
3171     }
3172 
3173     status = nvme_check_bounds(ns, slba, nlb);
3174     if (status) {
3175         goto invalid;
3176     }
3177 
3178     if (ns->params.zoned) {
3179         zone = nvme_get_zone_by_slba(ns, slba);
3180         assert(zone);
3181 
3182         if (append) {
3183             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3184 
3185             if (unlikely(slba != zone->d.zslba)) {
3186                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3187                 status = NVME_INVALID_FIELD;
3188                 goto invalid;
3189             }
3190 
3191             if (n->params.zasl &&
3192                 data_size > (uint64_t)n->page_size << n->params.zasl) {
3193                 trace_pci_nvme_err_zasl(data_size);
3194                 return NVME_INVALID_FIELD | NVME_DNR;
3195             }
3196 
3197             slba = zone->w_ptr;
3198             rw->slba = cpu_to_le64(slba);
3199             res->slba = cpu_to_le64(slba);
3200 
3201             switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3202             case NVME_ID_NS_DPS_TYPE_1:
3203                 if (!piremap) {
3204                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3205                 }
3206 
3207                 /* fallthrough */
3208 
3209             case NVME_ID_NS_DPS_TYPE_2:
3210                 if (piremap) {
3211                     uint32_t reftag = le32_to_cpu(rw->reftag);
3212                     rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3213                 }
3214 
3215                 break;
3216 
3217             case NVME_ID_NS_DPS_TYPE_3:
3218                 if (piremap) {
3219                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3220                 }
3221 
3222                 break;
3223             }
3224         }
3225 
3226         status = nvme_check_zone_write(ns, zone, slba, nlb);
3227         if (status) {
3228             goto invalid;
3229         }
3230 
3231         status = nvme_zrm_auto(n, ns, zone);
3232         if (status) {
3233             goto invalid;
3234         }
3235 
3236         zone->w_ptr += nlb;
3237     }
3238 
3239     data_offset = nvme_l2b(ns, slba);
3240 
3241     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3242         return nvme_dif_rw(n, req);
3243     }
3244 
3245     if (!wrz) {
3246         status = nvme_map_data(n, nlb, req);
3247         if (status) {
3248             goto invalid;
3249         }
3250 
3251         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3252                          BLOCK_ACCT_WRITE);
3253         nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3254     } else {
3255         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3256                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3257                                            req);
3258     }
3259 
3260     return NVME_NO_COMPLETE;
3261 
3262 invalid:
3263     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3264     return status | NVME_DNR;
3265 }
3266 
3267 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3268 {
3269     return nvme_do_write(n, req, false, false);
3270 }
3271 
3272 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3273 {
3274     return nvme_do_write(n, req, false, true);
3275 }
3276 
3277 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3278 {
3279     return nvme_do_write(n, req, true, false);
3280 }
3281 
3282 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3283                                             uint64_t *slba, uint32_t *zone_idx)
3284 {
3285     uint32_t dw10 = le32_to_cpu(c->cdw10);
3286     uint32_t dw11 = le32_to_cpu(c->cdw11);
3287 
3288     if (!ns->params.zoned) {
3289         trace_pci_nvme_err_invalid_opc(c->opcode);
3290         return NVME_INVALID_OPCODE | NVME_DNR;
3291     }
3292 
3293     *slba = ((uint64_t)dw11) << 32 | dw10;
3294     if (unlikely(*slba >= ns->id_ns.nsze)) {
3295         trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3296         *slba = 0;
3297         return NVME_LBA_RANGE | NVME_DNR;
3298     }
3299 
3300     *zone_idx = nvme_zone_idx(ns, *slba);
3301     assert(*zone_idx < ns->num_zones);
3302 
3303     return NVME_SUCCESS;
3304 }
3305 
3306 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3307                                  NvmeRequest *);
3308 
3309 enum NvmeZoneProcessingMask {
3310     NVME_PROC_CURRENT_ZONE    = 0,
3311     NVME_PROC_OPENED_ZONES    = 1 << 0,
3312     NVME_PROC_CLOSED_ZONES    = 1 << 1,
3313     NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3314     NVME_PROC_FULL_ZONES      = 1 << 3,
3315 };
3316 
3317 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3318                                NvmeZoneState state, NvmeRequest *req)
3319 {
3320     return nvme_zrm_open(nvme_ctrl(req), ns, zone);
3321 }
3322 
3323 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3324                                 NvmeZoneState state, NvmeRequest *req)
3325 {
3326     return nvme_zrm_close(ns, zone);
3327 }
3328 
3329 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3330                                  NvmeZoneState state, NvmeRequest *req)
3331 {
3332     return nvme_zrm_finish(ns, zone);
3333 }
3334 
3335 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3336                                   NvmeZoneState state, NvmeRequest *req)
3337 {
3338     switch (state) {
3339     case NVME_ZONE_STATE_READ_ONLY:
3340         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3341         /* fall through */
3342     case NVME_ZONE_STATE_OFFLINE:
3343         return NVME_SUCCESS;
3344     default:
3345         return NVME_ZONE_INVAL_TRANSITION;
3346     }
3347 }
3348 
3349 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3350 {
3351     uint16_t status;
3352     uint8_t state = nvme_get_zone_state(zone);
3353 
3354     if (state == NVME_ZONE_STATE_EMPTY) {
3355         status = nvme_aor_check(ns, 1, 0);
3356         if (status) {
3357             return status;
3358         }
3359         nvme_aor_inc_active(ns);
3360         zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3361         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3362         return NVME_SUCCESS;
3363     }
3364 
3365     return NVME_ZONE_INVAL_TRANSITION;
3366 }
3367 
3368 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3369                                     enum NvmeZoneProcessingMask proc_mask,
3370                                     op_handler_t op_hndlr, NvmeRequest *req)
3371 {
3372     uint16_t status = NVME_SUCCESS;
3373     NvmeZoneState zs = nvme_get_zone_state(zone);
3374     bool proc_zone;
3375 
3376     switch (zs) {
3377     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3378     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3379         proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3380         break;
3381     case NVME_ZONE_STATE_CLOSED:
3382         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3383         break;
3384     case NVME_ZONE_STATE_READ_ONLY:
3385         proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3386         break;
3387     case NVME_ZONE_STATE_FULL:
3388         proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3389         break;
3390     default:
3391         proc_zone = false;
3392     }
3393 
3394     if (proc_zone) {
3395         status = op_hndlr(ns, zone, zs, req);
3396     }
3397 
3398     return status;
3399 }
3400 
3401 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3402                                 enum NvmeZoneProcessingMask proc_mask,
3403                                 op_handler_t op_hndlr, NvmeRequest *req)
3404 {
3405     NvmeZone *next;
3406     uint16_t status = NVME_SUCCESS;
3407     int i;
3408 
3409     if (!proc_mask) {
3410         status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3411     } else {
3412         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3413             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3414                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3415                                              req);
3416                 if (status && status != NVME_NO_COMPLETE) {
3417                     goto out;
3418                 }
3419             }
3420         }
3421         if (proc_mask & NVME_PROC_OPENED_ZONES) {
3422             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3423                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3424                                              req);
3425                 if (status && status != NVME_NO_COMPLETE) {
3426                     goto out;
3427                 }
3428             }
3429 
3430             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3431                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3432                                              req);
3433                 if (status && status != NVME_NO_COMPLETE) {
3434                     goto out;
3435                 }
3436             }
3437         }
3438         if (proc_mask & NVME_PROC_FULL_ZONES) {
3439             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3440                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3441                                              req);
3442                 if (status && status != NVME_NO_COMPLETE) {
3443                     goto out;
3444                 }
3445             }
3446         }
3447 
3448         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3449             for (i = 0; i < ns->num_zones; i++, zone++) {
3450                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3451                                              req);
3452                 if (status && status != NVME_NO_COMPLETE) {
3453                     goto out;
3454                 }
3455             }
3456         }
3457     }
3458 
3459 out:
3460     return status;
3461 }
3462 
3463 typedef struct NvmeZoneResetAIOCB {
3464     BlockAIOCB common;
3465     BlockAIOCB *aiocb;
3466     NvmeRequest *req;
3467     QEMUBH *bh;
3468     int ret;
3469 
3470     bool all;
3471     int idx;
3472     NvmeZone *zone;
3473 } NvmeZoneResetAIOCB;
3474 
3475 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3476 {
3477     NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3478     NvmeRequest *req = iocb->req;
3479     NvmeNamespace *ns = req->ns;
3480 
3481     iocb->idx = ns->num_zones;
3482 
3483     iocb->ret = -ECANCELED;
3484 
3485     if (iocb->aiocb) {
3486         blk_aio_cancel_async(iocb->aiocb);
3487         iocb->aiocb = NULL;
3488     }
3489 }
3490 
3491 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3492     .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3493     .cancel_async = nvme_zone_reset_cancel,
3494 };
3495 
3496 static void nvme_zone_reset_bh(void *opaque)
3497 {
3498     NvmeZoneResetAIOCB *iocb = opaque;
3499 
3500     iocb->common.cb(iocb->common.opaque, iocb->ret);
3501 
3502     qemu_bh_delete(iocb->bh);
3503     iocb->bh = NULL;
3504     qemu_aio_unref(iocb);
3505 }
3506 
3507 static void nvme_zone_reset_cb(void *opaque, int ret);
3508 
3509 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3510 {
3511     NvmeZoneResetAIOCB *iocb = opaque;
3512     NvmeRequest *req = iocb->req;
3513     NvmeNamespace *ns = req->ns;
3514     int64_t moff;
3515     int count;
3516 
3517     if (ret < 0) {
3518         nvme_zone_reset_cb(iocb, ret);
3519         return;
3520     }
3521 
3522     if (!ns->lbaf.ms) {
3523         nvme_zone_reset_cb(iocb, 0);
3524         return;
3525     }
3526 
3527     moff = nvme_moff(ns, iocb->zone->d.zslba);
3528     count = nvme_m2b(ns, ns->zone_size);
3529 
3530     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3531                                         BDRV_REQ_MAY_UNMAP,
3532                                         nvme_zone_reset_cb, iocb);
3533     return;
3534 }
3535 
3536 static void nvme_zone_reset_cb(void *opaque, int ret)
3537 {
3538     NvmeZoneResetAIOCB *iocb = opaque;
3539     NvmeRequest *req = iocb->req;
3540     NvmeNamespace *ns = req->ns;
3541 
3542     if (ret < 0) {
3543         iocb->ret = ret;
3544         goto done;
3545     }
3546 
3547     if (iocb->zone) {
3548         nvme_zrm_reset(ns, iocb->zone);
3549 
3550         if (!iocb->all) {
3551             goto done;
3552         }
3553     }
3554 
3555     while (iocb->idx < ns->num_zones) {
3556         NvmeZone *zone = &ns->zone_array[iocb->idx++];
3557 
3558         switch (nvme_get_zone_state(zone)) {
3559         case NVME_ZONE_STATE_EMPTY:
3560             if (!iocb->all) {
3561                 goto done;
3562             }
3563 
3564             continue;
3565 
3566         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3567         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3568         case NVME_ZONE_STATE_CLOSED:
3569         case NVME_ZONE_STATE_FULL:
3570             iocb->zone = zone;
3571             break;
3572 
3573         default:
3574             continue;
3575         }
3576 
3577         trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3578 
3579         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3580                                             nvme_l2b(ns, zone->d.zslba),
3581                                             nvme_l2b(ns, ns->zone_size),
3582                                             BDRV_REQ_MAY_UNMAP,
3583                                             nvme_zone_reset_epilogue_cb,
3584                                             iocb);
3585         return;
3586     }
3587 
3588 done:
3589     iocb->aiocb = NULL;
3590     if (iocb->bh) {
3591         qemu_bh_schedule(iocb->bh);
3592     }
3593 }
3594 
3595 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3596 {
3597     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3598     NvmeNamespace *ns = req->ns;
3599     NvmeZone *zone;
3600     NvmeZoneResetAIOCB *iocb;
3601     uint8_t *zd_ext;
3602     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3603     uint64_t slba = 0;
3604     uint32_t zone_idx = 0;
3605     uint16_t status;
3606     uint8_t action;
3607     bool all;
3608     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3609 
3610     action = dw13 & 0xff;
3611     all = !!(dw13 & 0x100);
3612 
3613     req->status = NVME_SUCCESS;
3614 
3615     if (!all) {
3616         status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3617         if (status) {
3618             return status;
3619         }
3620     }
3621 
3622     zone = &ns->zone_array[zone_idx];
3623     if (slba != zone->d.zslba) {
3624         trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3625         return NVME_INVALID_FIELD | NVME_DNR;
3626     }
3627 
3628     switch (action) {
3629 
3630     case NVME_ZONE_ACTION_OPEN:
3631         if (all) {
3632             proc_mask = NVME_PROC_CLOSED_ZONES;
3633         }
3634         trace_pci_nvme_open_zone(slba, zone_idx, all);
3635         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3636         break;
3637 
3638     case NVME_ZONE_ACTION_CLOSE:
3639         if (all) {
3640             proc_mask = NVME_PROC_OPENED_ZONES;
3641         }
3642         trace_pci_nvme_close_zone(slba, zone_idx, all);
3643         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3644         break;
3645 
3646     case NVME_ZONE_ACTION_FINISH:
3647         if (all) {
3648             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3649         }
3650         trace_pci_nvme_finish_zone(slba, zone_idx, all);
3651         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3652         break;
3653 
3654     case NVME_ZONE_ACTION_RESET:
3655         trace_pci_nvme_reset_zone(slba, zone_idx, all);
3656 
3657         iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3658                            nvme_misc_cb, req);
3659 
3660         iocb->req = req;
3661         iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3662         iocb->ret = 0;
3663         iocb->all = all;
3664         iocb->idx = zone_idx;
3665         iocb->zone = NULL;
3666 
3667         req->aiocb = &iocb->common;
3668         nvme_zone_reset_cb(iocb, 0);
3669 
3670         return NVME_NO_COMPLETE;
3671 
3672     case NVME_ZONE_ACTION_OFFLINE:
3673         if (all) {
3674             proc_mask = NVME_PROC_READ_ONLY_ZONES;
3675         }
3676         trace_pci_nvme_offline_zone(slba, zone_idx, all);
3677         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3678         break;
3679 
3680     case NVME_ZONE_ACTION_SET_ZD_EXT:
3681         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3682         if (all || !ns->params.zd_extension_size) {
3683             return NVME_INVALID_FIELD | NVME_DNR;
3684         }
3685         zd_ext = nvme_get_zd_extension(ns, zone_idx);
3686         status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3687         if (status) {
3688             trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3689             return status;
3690         }
3691 
3692         status = nvme_set_zd_ext(ns, zone);
3693         if (status == NVME_SUCCESS) {
3694             trace_pci_nvme_zd_extension_set(zone_idx);
3695             return status;
3696         }
3697         break;
3698 
3699     default:
3700         trace_pci_nvme_err_invalid_mgmt_action(action);
3701         status = NVME_INVALID_FIELD;
3702     }
3703 
3704     if (status == NVME_ZONE_INVAL_TRANSITION) {
3705         trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3706                                                          zone->d.za);
3707     }
3708     if (status) {
3709         status |= NVME_DNR;
3710     }
3711 
3712     return status;
3713 }
3714 
3715 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3716 {
3717     NvmeZoneState zs = nvme_get_zone_state(zl);
3718 
3719     switch (zafs) {
3720     case NVME_ZONE_REPORT_ALL:
3721         return true;
3722     case NVME_ZONE_REPORT_EMPTY:
3723         return zs == NVME_ZONE_STATE_EMPTY;
3724     case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3725         return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3726     case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3727         return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3728     case NVME_ZONE_REPORT_CLOSED:
3729         return zs == NVME_ZONE_STATE_CLOSED;
3730     case NVME_ZONE_REPORT_FULL:
3731         return zs == NVME_ZONE_STATE_FULL;
3732     case NVME_ZONE_REPORT_READ_ONLY:
3733         return zs == NVME_ZONE_STATE_READ_ONLY;
3734     case NVME_ZONE_REPORT_OFFLINE:
3735         return zs == NVME_ZONE_STATE_OFFLINE;
3736     default:
3737         return false;
3738     }
3739 }
3740 
3741 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3742 {
3743     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3744     NvmeNamespace *ns = req->ns;
3745     /* cdw12 is zero-based number of dwords to return. Convert to bytes */
3746     uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3747     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3748     uint32_t zone_idx, zra, zrasf, partial;
3749     uint64_t max_zones, nr_zones = 0;
3750     uint16_t status;
3751     uint64_t slba;
3752     NvmeZoneDescr *z;
3753     NvmeZone *zone;
3754     NvmeZoneReportHeader *header;
3755     void *buf, *buf_p;
3756     size_t zone_entry_sz;
3757     int i;
3758 
3759     req->status = NVME_SUCCESS;
3760 
3761     status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3762     if (status) {
3763         return status;
3764     }
3765 
3766     zra = dw13 & 0xff;
3767     if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3768         return NVME_INVALID_FIELD | NVME_DNR;
3769     }
3770     if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3771         return NVME_INVALID_FIELD | NVME_DNR;
3772     }
3773 
3774     zrasf = (dw13 >> 8) & 0xff;
3775     if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3776         return NVME_INVALID_FIELD | NVME_DNR;
3777     }
3778 
3779     if (data_size < sizeof(NvmeZoneReportHeader)) {
3780         return NVME_INVALID_FIELD | NVME_DNR;
3781     }
3782 
3783     status = nvme_check_mdts(n, data_size);
3784     if (status) {
3785         return status;
3786     }
3787 
3788     partial = (dw13 >> 16) & 0x01;
3789 
3790     zone_entry_sz = sizeof(NvmeZoneDescr);
3791     if (zra == NVME_ZONE_REPORT_EXTENDED) {
3792         zone_entry_sz += ns->params.zd_extension_size;
3793     }
3794 
3795     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3796     buf = g_malloc0(data_size);
3797 
3798     zone = &ns->zone_array[zone_idx];
3799     for (i = zone_idx; i < ns->num_zones; i++) {
3800         if (partial && nr_zones >= max_zones) {
3801             break;
3802         }
3803         if (nvme_zone_matches_filter(zrasf, zone++)) {
3804             nr_zones++;
3805         }
3806     }
3807     header = (NvmeZoneReportHeader *)buf;
3808     header->nr_zones = cpu_to_le64(nr_zones);
3809 
3810     buf_p = buf + sizeof(NvmeZoneReportHeader);
3811     for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3812         zone = &ns->zone_array[zone_idx];
3813         if (nvme_zone_matches_filter(zrasf, zone)) {
3814             z = (NvmeZoneDescr *)buf_p;
3815             buf_p += sizeof(NvmeZoneDescr);
3816 
3817             z->zt = zone->d.zt;
3818             z->zs = zone->d.zs;
3819             z->zcap = cpu_to_le64(zone->d.zcap);
3820             z->zslba = cpu_to_le64(zone->d.zslba);
3821             z->za = zone->d.za;
3822 
3823             if (nvme_wp_is_valid(zone)) {
3824                 z->wp = cpu_to_le64(zone->d.wp);
3825             } else {
3826                 z->wp = cpu_to_le64(~0ULL);
3827             }
3828 
3829             if (zra == NVME_ZONE_REPORT_EXTENDED) {
3830                 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3831                     memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3832                            ns->params.zd_extension_size);
3833                 }
3834                 buf_p += ns->params.zd_extension_size;
3835             }
3836 
3837             max_zones--;
3838         }
3839     }
3840 
3841     status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3842 
3843     g_free(buf);
3844 
3845     return status;
3846 }
3847 
3848 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3849 {
3850     NvmeNamespace *ns;
3851     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3852 
3853     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3854                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3855 
3856     if (!nvme_nsid_valid(n, nsid)) {
3857         return NVME_INVALID_NSID | NVME_DNR;
3858     }
3859 
3860     /*
3861      * In the base NVM command set, Flush may apply to all namespaces
3862      * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
3863      * along with TP 4056 (Namespace Types), it may be pretty screwed up.
3864      *
3865      * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
3866      * opcode with a specific command since we cannot determine a unique I/O
3867      * command set. Opcode 0h could have any other meaning than something
3868      * equivalent to flushing and say it DOES have completely different
3869      * semantics in some other command set - does an NSID of FFFFFFFFh then
3870      * mean "for all namespaces, apply whatever command set specific command
3871      * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
3872      * whatever command that uses the 0h opcode if, and only if, it allows NSID
3873      * to be FFFFFFFFh"?
3874      *
3875      * Anyway (and luckily), for now, we do not care about this since the
3876      * device only supports namespace types that includes the NVM Flush command
3877      * (NVM and Zoned), so always do an NVM Flush.
3878      */
3879     if (req->cmd.opcode == NVME_CMD_FLUSH) {
3880         return nvme_flush(n, req);
3881     }
3882 
3883     ns = nvme_ns(n, nsid);
3884     if (unlikely(!ns)) {
3885         return NVME_INVALID_FIELD | NVME_DNR;
3886     }
3887 
3888     if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3889         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3890         return NVME_INVALID_OPCODE | NVME_DNR;
3891     }
3892 
3893     if (ns->status) {
3894         return ns->status;
3895     }
3896 
3897     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
3898         return NVME_INVALID_FIELD;
3899     }
3900 
3901     req->ns = ns;
3902 
3903     switch (req->cmd.opcode) {
3904     case NVME_CMD_WRITE_ZEROES:
3905         return nvme_write_zeroes(n, req);
3906     case NVME_CMD_ZONE_APPEND:
3907         return nvme_zone_append(n, req);
3908     case NVME_CMD_WRITE:
3909         return nvme_write(n, req);
3910     case NVME_CMD_READ:
3911         return nvme_read(n, req);
3912     case NVME_CMD_COMPARE:
3913         return nvme_compare(n, req);
3914     case NVME_CMD_DSM:
3915         return nvme_dsm(n, req);
3916     case NVME_CMD_VERIFY:
3917         return nvme_verify(n, req);
3918     case NVME_CMD_COPY:
3919         return nvme_copy(n, req);
3920     case NVME_CMD_ZONE_MGMT_SEND:
3921         return nvme_zone_mgmt_send(n, req);
3922     case NVME_CMD_ZONE_MGMT_RECV:
3923         return nvme_zone_mgmt_recv(n, req);
3924     default:
3925         assert(false);
3926     }
3927 
3928     return NVME_INVALID_OPCODE | NVME_DNR;
3929 }
3930 
3931 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3932 {
3933     n->sq[sq->sqid] = NULL;
3934     timer_free(sq->timer);
3935     g_free(sq->io_req);
3936     if (sq->sqid) {
3937         g_free(sq);
3938     }
3939 }
3940 
3941 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3942 {
3943     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3944     NvmeRequest *r, *next;
3945     NvmeSQueue *sq;
3946     NvmeCQueue *cq;
3947     uint16_t qid = le16_to_cpu(c->qid);
3948 
3949     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3950         trace_pci_nvme_err_invalid_del_sq(qid);
3951         return NVME_INVALID_QID | NVME_DNR;
3952     }
3953 
3954     trace_pci_nvme_del_sq(qid);
3955 
3956     sq = n->sq[qid];
3957     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3958         r = QTAILQ_FIRST(&sq->out_req_list);
3959         assert(r->aiocb);
3960         blk_aio_cancel(r->aiocb);
3961     }
3962 
3963     assert(QTAILQ_EMPTY(&sq->out_req_list));
3964 
3965     if (!nvme_check_cqid(n, sq->cqid)) {
3966         cq = n->cq[sq->cqid];
3967         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3968 
3969         nvme_post_cqes(cq);
3970         QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3971             if (r->sq == sq) {
3972                 QTAILQ_REMOVE(&cq->req_list, r, entry);
3973                 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3974             }
3975         }
3976     }
3977 
3978     nvme_free_sq(sq, n);
3979     return NVME_SUCCESS;
3980 }
3981 
3982 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3983                          uint16_t sqid, uint16_t cqid, uint16_t size)
3984 {
3985     int i;
3986     NvmeCQueue *cq;
3987 
3988     sq->ctrl = n;
3989     sq->dma_addr = dma_addr;
3990     sq->sqid = sqid;
3991     sq->size = size;
3992     sq->cqid = cqid;
3993     sq->head = sq->tail = 0;
3994     sq->io_req = g_new0(NvmeRequest, sq->size);
3995 
3996     QTAILQ_INIT(&sq->req_list);
3997     QTAILQ_INIT(&sq->out_req_list);
3998     for (i = 0; i < sq->size; i++) {
3999         sq->io_req[i].sq = sq;
4000         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4001     }
4002     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
4003 
4004     assert(n->cq[cqid]);
4005     cq = n->cq[cqid];
4006     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4007     n->sq[sqid] = sq;
4008 }
4009 
4010 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4011 {
4012     NvmeSQueue *sq;
4013     NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4014 
4015     uint16_t cqid = le16_to_cpu(c->cqid);
4016     uint16_t sqid = le16_to_cpu(c->sqid);
4017     uint16_t qsize = le16_to_cpu(c->qsize);
4018     uint16_t qflags = le16_to_cpu(c->sq_flags);
4019     uint64_t prp1 = le64_to_cpu(c->prp1);
4020 
4021     trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4022 
4023     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4024         trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4025         return NVME_INVALID_CQID | NVME_DNR;
4026     }
4027     if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4028         n->sq[sqid] != NULL)) {
4029         trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4030         return NVME_INVALID_QID | NVME_DNR;
4031     }
4032     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4033         trace_pci_nvme_err_invalid_create_sq_size(qsize);
4034         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4035     }
4036     if (unlikely(prp1 & (n->page_size - 1))) {
4037         trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4038         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4039     }
4040     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4041         trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4042         return NVME_INVALID_FIELD | NVME_DNR;
4043     }
4044     sq = g_malloc0(sizeof(*sq));
4045     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4046     return NVME_SUCCESS;
4047 }
4048 
4049 struct nvme_stats {
4050     uint64_t units_read;
4051     uint64_t units_written;
4052     uint64_t read_commands;
4053     uint64_t write_commands;
4054 };
4055 
4056 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4057 {
4058     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4059 
4060     stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4061     stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4062     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4063     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4064 }
4065 
4066 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4067                                 uint64_t off, NvmeRequest *req)
4068 {
4069     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4070     struct nvme_stats stats = { 0 };
4071     NvmeSmartLog smart = { 0 };
4072     uint32_t trans_len;
4073     NvmeNamespace *ns;
4074     time_t current_ms;
4075 
4076     if (off >= sizeof(smart)) {
4077         return NVME_INVALID_FIELD | NVME_DNR;
4078     }
4079 
4080     if (nsid != 0xffffffff) {
4081         ns = nvme_ns(n, nsid);
4082         if (!ns) {
4083             return NVME_INVALID_NSID | NVME_DNR;
4084         }
4085         nvme_set_blk_stats(ns, &stats);
4086     } else {
4087         int i;
4088 
4089         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4090             ns = nvme_ns(n, i);
4091             if (!ns) {
4092                 continue;
4093             }
4094             nvme_set_blk_stats(ns, &stats);
4095         }
4096     }
4097 
4098     trans_len = MIN(sizeof(smart) - off, buf_len);
4099     smart.critical_warning = n->smart_critical_warning;
4100 
4101     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4102                                                         1000));
4103     smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4104                                                            1000));
4105     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4106     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4107 
4108     smart.temperature = cpu_to_le16(n->temperature);
4109 
4110     if ((n->temperature >= n->features.temp_thresh_hi) ||
4111         (n->temperature <= n->features.temp_thresh_low)) {
4112         smart.critical_warning |= NVME_SMART_TEMPERATURE;
4113     }
4114 
4115     current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4116     smart.power_on_hours[0] =
4117         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4118 
4119     if (!rae) {
4120         nvme_clear_events(n, NVME_AER_TYPE_SMART);
4121     }
4122 
4123     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4124 }
4125 
4126 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4127                                  NvmeRequest *req)
4128 {
4129     uint32_t trans_len;
4130     NvmeFwSlotInfoLog fw_log = {
4131         .afi = 0x1,
4132     };
4133 
4134     if (off >= sizeof(fw_log)) {
4135         return NVME_INVALID_FIELD | NVME_DNR;
4136     }
4137 
4138     strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4139     trans_len = MIN(sizeof(fw_log) - off, buf_len);
4140 
4141     return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4142 }
4143 
4144 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4145                                 uint64_t off, NvmeRequest *req)
4146 {
4147     uint32_t trans_len;
4148     NvmeErrorLog errlog;
4149 
4150     if (off >= sizeof(errlog)) {
4151         return NVME_INVALID_FIELD | NVME_DNR;
4152     }
4153 
4154     if (!rae) {
4155         nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4156     }
4157 
4158     memset(&errlog, 0x0, sizeof(errlog));
4159     trans_len = MIN(sizeof(errlog) - off, buf_len);
4160 
4161     return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4162 }
4163 
4164 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4165                                     uint64_t off, NvmeRequest *req)
4166 {
4167     uint32_t nslist[1024];
4168     uint32_t trans_len;
4169     int i = 0;
4170     uint32_t nsid;
4171 
4172     if (off >= sizeof(nslist)) {
4173         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
4174         return NVME_INVALID_FIELD | NVME_DNR;
4175     }
4176 
4177     memset(nslist, 0x0, sizeof(nslist));
4178     trans_len = MIN(sizeof(nslist) - off, buf_len);
4179 
4180     while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4181             NVME_CHANGED_NSID_SIZE) {
4182         /*
4183          * If more than 1024 namespaces, the first entry in the log page should
4184          * be set to FFFFFFFFh and the others to 0 as spec.
4185          */
4186         if (i == ARRAY_SIZE(nslist)) {
4187             memset(nslist, 0x0, sizeof(nslist));
4188             nslist[0] = 0xffffffff;
4189             break;
4190         }
4191 
4192         nslist[i++] = nsid;
4193         clear_bit(nsid, n->changed_nsids);
4194     }
4195 
4196     /*
4197      * Remove all the remaining list entries in case returns directly due to
4198      * more than 1024 namespaces.
4199      */
4200     if (nslist[0] == 0xffffffff) {
4201         bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4202     }
4203 
4204     if (!rae) {
4205         nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4206     }
4207 
4208     return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4209 }
4210 
4211 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4212                                  uint64_t off, NvmeRequest *req)
4213 {
4214     NvmeEffectsLog log = {};
4215     const uint32_t *src_iocs = NULL;
4216     uint32_t trans_len;
4217 
4218     if (off >= sizeof(log)) {
4219         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4220         return NVME_INVALID_FIELD | NVME_DNR;
4221     }
4222 
4223     switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4224     case NVME_CC_CSS_NVM:
4225         src_iocs = nvme_cse_iocs_nvm;
4226         /* fall through */
4227     case NVME_CC_CSS_ADMIN_ONLY:
4228         break;
4229     case NVME_CC_CSS_CSI:
4230         switch (csi) {
4231         case NVME_CSI_NVM:
4232             src_iocs = nvme_cse_iocs_nvm;
4233             break;
4234         case NVME_CSI_ZONED:
4235             src_iocs = nvme_cse_iocs_zoned;
4236             break;
4237         }
4238     }
4239 
4240     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4241 
4242     if (src_iocs) {
4243         memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4244     }
4245 
4246     trans_len = MIN(sizeof(log) - off, buf_len);
4247 
4248     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4249 }
4250 
4251 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4252 {
4253     NvmeCmd *cmd = &req->cmd;
4254 
4255     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4256     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4257     uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4258     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4259     uint8_t  lid = dw10 & 0xff;
4260     uint8_t  lsp = (dw10 >> 8) & 0xf;
4261     uint8_t  rae = (dw10 >> 15) & 0x1;
4262     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
4263     uint32_t numdl, numdu;
4264     uint64_t off, lpol, lpou;
4265     size_t   len;
4266     uint16_t status;
4267 
4268     numdl = (dw10 >> 16);
4269     numdu = (dw11 & 0xffff);
4270     lpol = dw12;
4271     lpou = dw13;
4272 
4273     len = (((numdu << 16) | numdl) + 1) << 2;
4274     off = (lpou << 32ULL) | lpol;
4275 
4276     if (off & 0x3) {
4277         return NVME_INVALID_FIELD | NVME_DNR;
4278     }
4279 
4280     trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4281 
4282     status = nvme_check_mdts(n, len);
4283     if (status) {
4284         return status;
4285     }
4286 
4287     switch (lid) {
4288     case NVME_LOG_ERROR_INFO:
4289         return nvme_error_info(n, rae, len, off, req);
4290     case NVME_LOG_SMART_INFO:
4291         return nvme_smart_info(n, rae, len, off, req);
4292     case NVME_LOG_FW_SLOT_INFO:
4293         return nvme_fw_log_info(n, len, off, req);
4294     case NVME_LOG_CHANGED_NSLIST:
4295         return nvme_changed_nslist(n, rae, len, off, req);
4296     case NVME_LOG_CMD_EFFECTS:
4297         return nvme_cmd_effects(n, csi, len, off, req);
4298     default:
4299         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4300         return NVME_INVALID_FIELD | NVME_DNR;
4301     }
4302 }
4303 
4304 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4305 {
4306     n->cq[cq->cqid] = NULL;
4307     timer_free(cq->timer);
4308     if (msix_enabled(&n->parent_obj)) {
4309         msix_vector_unuse(&n->parent_obj, cq->vector);
4310     }
4311     if (cq->cqid) {
4312         g_free(cq);
4313     }
4314 }
4315 
4316 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4317 {
4318     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4319     NvmeCQueue *cq;
4320     uint16_t qid = le16_to_cpu(c->qid);
4321 
4322     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4323         trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4324         return NVME_INVALID_CQID | NVME_DNR;
4325     }
4326 
4327     cq = n->cq[qid];
4328     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4329         trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4330         return NVME_INVALID_QUEUE_DEL;
4331     }
4332 
4333     if (cq->irq_enabled && cq->tail != cq->head) {
4334         n->cq_pending--;
4335     }
4336 
4337     nvme_irq_deassert(n, cq);
4338     trace_pci_nvme_del_cq(qid);
4339     nvme_free_cq(cq, n);
4340     return NVME_SUCCESS;
4341 }
4342 
4343 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4344                          uint16_t cqid, uint16_t vector, uint16_t size,
4345                          uint16_t irq_enabled)
4346 {
4347     int ret;
4348 
4349     if (msix_enabled(&n->parent_obj)) {
4350         ret = msix_vector_use(&n->parent_obj, vector);
4351         assert(ret == 0);
4352     }
4353     cq->ctrl = n;
4354     cq->cqid = cqid;
4355     cq->size = size;
4356     cq->dma_addr = dma_addr;
4357     cq->phase = 1;
4358     cq->irq_enabled = irq_enabled;
4359     cq->vector = vector;
4360     cq->head = cq->tail = 0;
4361     QTAILQ_INIT(&cq->req_list);
4362     QTAILQ_INIT(&cq->sq_list);
4363     n->cq[cqid] = cq;
4364     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4365 }
4366 
4367 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4368 {
4369     NvmeCQueue *cq;
4370     NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4371     uint16_t cqid = le16_to_cpu(c->cqid);
4372     uint16_t vector = le16_to_cpu(c->irq_vector);
4373     uint16_t qsize = le16_to_cpu(c->qsize);
4374     uint16_t qflags = le16_to_cpu(c->cq_flags);
4375     uint64_t prp1 = le64_to_cpu(c->prp1);
4376 
4377     trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4378                              NVME_CQ_FLAGS_IEN(qflags) != 0);
4379 
4380     if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4381         n->cq[cqid] != NULL)) {
4382         trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4383         return NVME_INVALID_QID | NVME_DNR;
4384     }
4385     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4386         trace_pci_nvme_err_invalid_create_cq_size(qsize);
4387         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4388     }
4389     if (unlikely(prp1 & (n->page_size - 1))) {
4390         trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4391         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4392     }
4393     if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4394         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4395         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4396     }
4397     if (unlikely(vector >= n->params.msix_qsize)) {
4398         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4399         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4400     }
4401     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4402         trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4403         return NVME_INVALID_FIELD | NVME_DNR;
4404     }
4405 
4406     cq = g_malloc0(sizeof(*cq));
4407     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4408                  NVME_CQ_FLAGS_IEN(qflags));
4409 
4410     /*
4411      * It is only required to set qs_created when creating a completion queue;
4412      * creating a submission queue without a matching completion queue will
4413      * fail.
4414      */
4415     n->qs_created = true;
4416     return NVME_SUCCESS;
4417 }
4418 
4419 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4420 {
4421     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4422 
4423     return nvme_c2h(n, id, sizeof(id), req);
4424 }
4425 
4426 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4427 {
4428     trace_pci_nvme_identify_ctrl();
4429 
4430     return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4431 }
4432 
4433 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4434 {
4435     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4436     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4437     NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4438 
4439     trace_pci_nvme_identify_ctrl_csi(c->csi);
4440 
4441     switch (c->csi) {
4442     case NVME_CSI_NVM:
4443         id_nvm->vsl = n->params.vsl;
4444         id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4445         break;
4446 
4447     case NVME_CSI_ZONED:
4448         ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4449         break;
4450 
4451     default:
4452         return NVME_INVALID_FIELD | NVME_DNR;
4453     }
4454 
4455     return nvme_c2h(n, id, sizeof(id), req);
4456 }
4457 
4458 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4459 {
4460     NvmeNamespace *ns;
4461     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4462     uint32_t nsid = le32_to_cpu(c->nsid);
4463 
4464     trace_pci_nvme_identify_ns(nsid);
4465 
4466     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4467         return NVME_INVALID_NSID | NVME_DNR;
4468     }
4469 
4470     ns = nvme_ns(n, nsid);
4471     if (unlikely(!ns)) {
4472         if (!active) {
4473             ns = nvme_subsys_ns(n->subsys, nsid);
4474             if (!ns) {
4475                 return nvme_rpt_empty_id_struct(n, req);
4476             }
4477         } else {
4478             return nvme_rpt_empty_id_struct(n, req);
4479         }
4480     }
4481 
4482     if (active || ns->csi == NVME_CSI_NVM) {
4483         return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4484     }
4485 
4486     return NVME_INVALID_CMD_SET | NVME_DNR;
4487 }
4488 
4489 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4490                                         bool attached)
4491 {
4492     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4493     uint32_t nsid = le32_to_cpu(c->nsid);
4494     uint16_t min_id = le16_to_cpu(c->ctrlid);
4495     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4496     uint16_t *ids = &list[1];
4497     NvmeNamespace *ns;
4498     NvmeCtrl *ctrl;
4499     int cntlid, nr_ids = 0;
4500 
4501     trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4502 
4503     if (!n->subsys) {
4504         return NVME_INVALID_FIELD | NVME_DNR;
4505     }
4506 
4507     if (attached) {
4508         if (nsid == NVME_NSID_BROADCAST) {
4509             return NVME_INVALID_FIELD | NVME_DNR;
4510         }
4511 
4512         ns = nvme_subsys_ns(n->subsys, nsid);
4513         if (!ns) {
4514             return NVME_INVALID_FIELD | NVME_DNR;
4515         }
4516     }
4517 
4518     for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4519         ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4520         if (!ctrl) {
4521             continue;
4522         }
4523 
4524         if (attached && !nvme_ns(ctrl, nsid)) {
4525             continue;
4526         }
4527 
4528         ids[nr_ids++] = cntlid;
4529     }
4530 
4531     list[0] = nr_ids;
4532 
4533     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4534 }
4535 
4536 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4537                                      bool active)
4538 {
4539     NvmeNamespace *ns;
4540     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4541     uint32_t nsid = le32_to_cpu(c->nsid);
4542 
4543     trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4544 
4545     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4546         return NVME_INVALID_NSID | NVME_DNR;
4547     }
4548 
4549     ns = nvme_ns(n, nsid);
4550     if (unlikely(!ns)) {
4551         if (!active) {
4552             ns = nvme_subsys_ns(n->subsys, nsid);
4553             if (!ns) {
4554                 return nvme_rpt_empty_id_struct(n, req);
4555             }
4556         } else {
4557             return nvme_rpt_empty_id_struct(n, req);
4558         }
4559     }
4560 
4561     if (c->csi == NVME_CSI_NVM) {
4562         return nvme_rpt_empty_id_struct(n, req);
4563     } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4564         return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4565                         req);
4566     }
4567 
4568     return NVME_INVALID_FIELD | NVME_DNR;
4569 }
4570 
4571 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4572                                      bool active)
4573 {
4574     NvmeNamespace *ns;
4575     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4576     uint32_t min_nsid = le32_to_cpu(c->nsid);
4577     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4578     static const int data_len = sizeof(list);
4579     uint32_t *list_ptr = (uint32_t *)list;
4580     int i, j = 0;
4581 
4582     trace_pci_nvme_identify_nslist(min_nsid);
4583 
4584     /*
4585      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFFEh are invalid values
4586      * since the Active Namespace ID List should return namespaces with ids
4587      * *higher* than the NSID specified in the command. This is also specified
4588      * in the spec (NVM Express v1.3d, Section 5.15.4).
4589      */
4590     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4591         return NVME_INVALID_NSID | NVME_DNR;
4592     }
4593 
4594     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4595         ns = nvme_ns(n, i);
4596         if (!ns) {
4597             if (!active) {
4598                 ns = nvme_subsys_ns(n->subsys, i);
4599                 if (!ns) {
4600                     continue;
4601                 }
4602             } else {
4603                 continue;
4604             }
4605         }
4606         if (ns->params.nsid <= min_nsid) {
4607             continue;
4608         }
4609         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4610         if (j == data_len / sizeof(uint32_t)) {
4611             break;
4612         }
4613     }
4614 
4615     return nvme_c2h(n, list, data_len, req);
4616 }
4617 
4618 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4619                                          bool active)
4620 {
4621     NvmeNamespace *ns;
4622     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4623     uint32_t min_nsid = le32_to_cpu(c->nsid);
4624     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4625     static const int data_len = sizeof(list);
4626     uint32_t *list_ptr = (uint32_t *)list;
4627     int i, j = 0;
4628 
4629     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4630 
4631     /*
4632      * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFFEh are invalid.
4633      */
4634     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4635         return NVME_INVALID_NSID | NVME_DNR;
4636     }
4637 
4638     if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4639         return NVME_INVALID_FIELD | NVME_DNR;
4640     }
4641 
4642     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4643         ns = nvme_ns(n, i);
4644         if (!ns) {
4645             if (!active) {
4646                 ns = nvme_subsys_ns(n->subsys, i);
4647                 if (!ns) {
4648                     continue;
4649                 }
4650             } else {
4651                 continue;
4652             }
4653         }
4654         if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4655             continue;
4656         }
4657         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4658         if (j == data_len / sizeof(uint32_t)) {
4659             break;
4660         }
4661     }
4662 
4663     return nvme_c2h(n, list, data_len, req);
4664 }
4665 
4666 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4667 {
4668     NvmeNamespace *ns;
4669     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4670     uint32_t nsid = le32_to_cpu(c->nsid);
4671     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4672     uint8_t *pos = list;
4673     struct {
4674         NvmeIdNsDescr hdr;
4675         uint8_t v[NVME_NIDL_UUID];
4676     } QEMU_PACKED uuid = {};
4677     struct {
4678         NvmeIdNsDescr hdr;
4679         uint64_t v;
4680     } QEMU_PACKED eui64 = {};
4681     struct {
4682         NvmeIdNsDescr hdr;
4683         uint8_t v;
4684     } QEMU_PACKED csi = {};
4685 
4686     trace_pci_nvme_identify_ns_descr_list(nsid);
4687 
4688     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4689         return NVME_INVALID_NSID | NVME_DNR;
4690     }
4691 
4692     ns = nvme_ns(n, nsid);
4693     if (unlikely(!ns)) {
4694         return NVME_INVALID_FIELD | NVME_DNR;
4695     }
4696 
4697     /*
4698      * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4699      * provide a valid Namespace UUID in the Namespace Identification Descriptor
4700      * data structure. QEMU does not yet support setting NGUID.
4701      */
4702     uuid.hdr.nidt = NVME_NIDT_UUID;
4703     uuid.hdr.nidl = NVME_NIDL_UUID;
4704     memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4705     memcpy(pos, &uuid, sizeof(uuid));
4706     pos += sizeof(uuid);
4707 
4708     if (ns->params.eui64) {
4709         eui64.hdr.nidt = NVME_NIDT_EUI64;
4710         eui64.hdr.nidl = NVME_NIDL_EUI64;
4711         eui64.v = cpu_to_be64(ns->params.eui64);
4712         memcpy(pos, &eui64, sizeof(eui64));
4713         pos += sizeof(eui64);
4714     }
4715 
4716     csi.hdr.nidt = NVME_NIDT_CSI;
4717     csi.hdr.nidl = NVME_NIDL_CSI;
4718     csi.v = ns->csi;
4719     memcpy(pos, &csi, sizeof(csi));
4720     pos += sizeof(csi);
4721 
4722     return nvme_c2h(n, list, sizeof(list), req);
4723 }
4724 
4725 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4726 {
4727     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4728     static const int data_len = sizeof(list);
4729 
4730     trace_pci_nvme_identify_cmd_set();
4731 
4732     NVME_SET_CSI(*list, NVME_CSI_NVM);
4733     NVME_SET_CSI(*list, NVME_CSI_ZONED);
4734 
4735     return nvme_c2h(n, list, data_len, req);
4736 }
4737 
4738 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4739 {
4740     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4741 
4742     trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4743                             c->csi);
4744 
4745     switch (c->cns) {
4746     case NVME_ID_CNS_NS:
4747         return nvme_identify_ns(n, req, true);
4748     case NVME_ID_CNS_NS_PRESENT:
4749         return nvme_identify_ns(n, req, false);
4750     case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4751         return nvme_identify_ctrl_list(n, req, true);
4752     case NVME_ID_CNS_CTRL_LIST:
4753         return nvme_identify_ctrl_list(n, req, false);
4754     case NVME_ID_CNS_CS_NS:
4755         return nvme_identify_ns_csi(n, req, true);
4756     case NVME_ID_CNS_CS_NS_PRESENT:
4757         return nvme_identify_ns_csi(n, req, false);
4758     case NVME_ID_CNS_CTRL:
4759         return nvme_identify_ctrl(n, req);
4760     case NVME_ID_CNS_CS_CTRL:
4761         return nvme_identify_ctrl_csi(n, req);
4762     case NVME_ID_CNS_NS_ACTIVE_LIST:
4763         return nvme_identify_nslist(n, req, true);
4764     case NVME_ID_CNS_NS_PRESENT_LIST:
4765         return nvme_identify_nslist(n, req, false);
4766     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4767         return nvme_identify_nslist_csi(n, req, true);
4768     case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4769         return nvme_identify_nslist_csi(n, req, false);
4770     case NVME_ID_CNS_NS_DESCR_LIST:
4771         return nvme_identify_ns_descr_list(n, req);
4772     case NVME_ID_CNS_IO_COMMAND_SET:
4773         return nvme_identify_cmd_set(n, req);
4774     default:
4775         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4776         return NVME_INVALID_FIELD | NVME_DNR;
4777     }
4778 }
4779 
4780 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4781 {
4782     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4783 
4784     req->cqe.result = 1;
4785     if (nvme_check_sqid(n, sqid)) {
4786         return NVME_INVALID_FIELD | NVME_DNR;
4787     }
4788 
4789     return NVME_SUCCESS;
4790 }
4791 
4792 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4793 {
4794     trace_pci_nvme_setfeat_timestamp(ts);
4795 
4796     n->host_timestamp = le64_to_cpu(ts);
4797     n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4798 }
4799 
4800 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4801 {
4802     uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4803     uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4804 
4805     union nvme_timestamp {
4806         struct {
4807             uint64_t timestamp:48;
4808             uint64_t sync:1;
4809             uint64_t origin:3;
4810             uint64_t rsvd1:12;
4811         };
4812         uint64_t all;
4813     };
4814 
4815     union nvme_timestamp ts;
4816     ts.all = 0;
4817     ts.timestamp = n->host_timestamp + elapsed_time;
4818 
4819     /* If the host timestamp is non-zero, set the timestamp origin */
4820     ts.origin = n->host_timestamp ? 0x01 : 0x00;
4821 
4822     trace_pci_nvme_getfeat_timestamp(ts.all);
4823 
4824     return cpu_to_le64(ts.all);
4825 }
4826 
4827 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4828 {
4829     uint64_t timestamp = nvme_get_timestamp(n);
4830 
4831     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4832 }
4833 
4834 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4835 {
4836     NvmeCmd *cmd = &req->cmd;
4837     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4838     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4839     uint32_t nsid = le32_to_cpu(cmd->nsid);
4840     uint32_t result;
4841     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4842     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4843     uint16_t iv;
4844     NvmeNamespace *ns;
4845     int i;
4846 
4847     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4848         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4849     };
4850 
4851     trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4852 
4853     if (!nvme_feature_support[fid]) {
4854         return NVME_INVALID_FIELD | NVME_DNR;
4855     }
4856 
4857     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4858         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4859             /*
4860              * The Reservation Notification Mask and Reservation Persistence
4861              * features require a status code of Invalid Field in Command when
4862              * NSID is FFFFFFFFh. Since the device does not support those
4863              * features we can always return Invalid Namespace or Format as we
4864              * should do for all other features.
4865              */
4866             return NVME_INVALID_NSID | NVME_DNR;
4867         }
4868 
4869         if (!nvme_ns(n, nsid)) {
4870             return NVME_INVALID_FIELD | NVME_DNR;
4871         }
4872     }
4873 
4874     switch (sel) {
4875     case NVME_GETFEAT_SELECT_CURRENT:
4876         break;
4877     case NVME_GETFEAT_SELECT_SAVED:
4878         /* no features are saveable by the controller; fallthrough */
4879     case NVME_GETFEAT_SELECT_DEFAULT:
4880         goto defaults;
4881     case NVME_GETFEAT_SELECT_CAP:
4882         result = nvme_feature_cap[fid];
4883         goto out;
4884     }
4885 
4886     switch (fid) {
4887     case NVME_TEMPERATURE_THRESHOLD:
4888         result = 0;
4889 
4890         /*
4891          * The controller only implements the Composite Temperature sensor, so
4892          * return 0 for all other sensors.
4893          */
4894         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4895             goto out;
4896         }
4897 
4898         switch (NVME_TEMP_THSEL(dw11)) {
4899         case NVME_TEMP_THSEL_OVER:
4900             result = n->features.temp_thresh_hi;
4901             goto out;
4902         case NVME_TEMP_THSEL_UNDER:
4903             result = n->features.temp_thresh_low;
4904             goto out;
4905         }
4906 
4907         return NVME_INVALID_FIELD | NVME_DNR;
4908     case NVME_ERROR_RECOVERY:
4909         if (!nvme_nsid_valid(n, nsid)) {
4910             return NVME_INVALID_NSID | NVME_DNR;
4911         }
4912 
4913         ns = nvme_ns(n, nsid);
4914         if (unlikely(!ns)) {
4915             return NVME_INVALID_FIELD | NVME_DNR;
4916         }
4917 
4918         result = ns->features.err_rec;
4919         goto out;
4920     case NVME_VOLATILE_WRITE_CACHE:
4921         result = 0;
4922         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4923             ns = nvme_ns(n, i);
4924             if (!ns) {
4925                 continue;
4926             }
4927 
4928             result = blk_enable_write_cache(ns->blkconf.blk);
4929             if (result) {
4930                 break;
4931             }
4932         }
4933         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4934         goto out;
4935     case NVME_ASYNCHRONOUS_EVENT_CONF:
4936         result = n->features.async_config;
4937         goto out;
4938     case NVME_TIMESTAMP:
4939         return nvme_get_feature_timestamp(n, req);
4940     default:
4941         break;
4942     }
4943 
4944 defaults:
4945     switch (fid) {
4946     case NVME_TEMPERATURE_THRESHOLD:
4947         result = 0;
4948 
4949         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4950             break;
4951         }
4952 
4953         if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4954             result = NVME_TEMPERATURE_WARNING;
4955         }
4956 
4957         break;
4958     case NVME_NUMBER_OF_QUEUES:
4959         result = (n->params.max_ioqpairs - 1) |
4960             ((n->params.max_ioqpairs - 1) << 16);
4961         trace_pci_nvme_getfeat_numq(result);
4962         break;
4963     case NVME_INTERRUPT_VECTOR_CONF:
4964         iv = dw11 & 0xffff;
4965         if (iv >= n->params.max_ioqpairs + 1) {
4966             return NVME_INVALID_FIELD | NVME_DNR;
4967         }
4968 
4969         result = iv;
4970         if (iv == n->admin_cq.vector) {
4971             result |= NVME_INTVC_NOCOALESCING;
4972         }
4973         break;
4974     default:
4975         result = nvme_feature_default[fid];
4976         break;
4977     }
4978 
4979 out:
4980     req->cqe.result = cpu_to_le32(result);
4981     return NVME_SUCCESS;
4982 }
4983 
4984 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4985 {
4986     uint16_t ret;
4987     uint64_t timestamp;
4988 
4989     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4990     if (ret) {
4991         return ret;
4992     }
4993 
4994     nvme_set_timestamp(n, timestamp);
4995 
4996     return NVME_SUCCESS;
4997 }
4998 
4999 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
5000 {
5001     NvmeNamespace *ns = NULL;
5002 
5003     NvmeCmd *cmd = &req->cmd;
5004     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5005     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5006     uint32_t nsid = le32_to_cpu(cmd->nsid);
5007     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5008     uint8_t save = NVME_SETFEAT_SAVE(dw10);
5009     int i;
5010 
5011     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5012 
5013     if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5014         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5015     }
5016 
5017     if (!nvme_feature_support[fid]) {
5018         return NVME_INVALID_FIELD | NVME_DNR;
5019     }
5020 
5021     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5022         if (nsid != NVME_NSID_BROADCAST) {
5023             if (!nvme_nsid_valid(n, nsid)) {
5024                 return NVME_INVALID_NSID | NVME_DNR;
5025             }
5026 
5027             ns = nvme_ns(n, nsid);
5028             if (unlikely(!ns)) {
5029                 return NVME_INVALID_FIELD | NVME_DNR;
5030             }
5031         }
5032     } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5033         if (!nvme_nsid_valid(n, nsid)) {
5034             return NVME_INVALID_NSID | NVME_DNR;
5035         }
5036 
5037         return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5038     }
5039 
5040     if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5041         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5042     }
5043 
5044     switch (fid) {
5045     case NVME_TEMPERATURE_THRESHOLD:
5046         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5047             break;
5048         }
5049 
5050         switch (NVME_TEMP_THSEL(dw11)) {
5051         case NVME_TEMP_THSEL_OVER:
5052             n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5053             break;
5054         case NVME_TEMP_THSEL_UNDER:
5055             n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5056             break;
5057         default:
5058             return NVME_INVALID_FIELD | NVME_DNR;
5059         }
5060 
5061         if ((n->temperature >= n->features.temp_thresh_hi) ||
5062             (n->temperature <= n->features.temp_thresh_low)) {
5063             nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5064         }
5065 
5066         break;
5067     case NVME_ERROR_RECOVERY:
5068         if (nsid == NVME_NSID_BROADCAST) {
5069             for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5070                 ns = nvme_ns(n, i);
5071 
5072                 if (!ns) {
5073                     continue;
5074                 }
5075 
5076                 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5077                     ns->features.err_rec = dw11;
5078                 }
5079             }
5080 
5081             break;
5082         }
5083 
5084         assert(ns);
5085         if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
5086             ns->features.err_rec = dw11;
5087         }
5088         break;
5089     case NVME_VOLATILE_WRITE_CACHE:
5090         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5091             ns = nvme_ns(n, i);
5092             if (!ns) {
5093                 continue;
5094             }
5095 
5096             if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5097                 blk_flush(ns->blkconf.blk);
5098             }
5099 
5100             blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5101         }
5102 
5103         break;
5104 
5105     case NVME_NUMBER_OF_QUEUES:
5106         if (n->qs_created) {
5107             return NVME_CMD_SEQ_ERROR | NVME_DNR;
5108         }
5109 
5110         /*
5111          * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
5112          * and NSQR.
5113          */
5114         if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5115             return NVME_INVALID_FIELD | NVME_DNR;
5116         }
5117 
5118         trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5119                                     ((dw11 >> 16) & 0xffff) + 1,
5120                                     n->params.max_ioqpairs,
5121                                     n->params.max_ioqpairs);
5122         req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5123                                       ((n->params.max_ioqpairs - 1) << 16));
5124         break;
5125     case NVME_ASYNCHRONOUS_EVENT_CONF:
5126         n->features.async_config = dw11;
5127         break;
5128     case NVME_TIMESTAMP:
5129         return nvme_set_feature_timestamp(n, req);
5130     case NVME_COMMAND_SET_PROFILE:
5131         if (dw11 & 0x1ff) {
5132             trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5133             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5134         }
5135         break;
5136     default:
5137         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5138     }
5139     return NVME_SUCCESS;
5140 }
5141 
5142 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5143 {
5144     trace_pci_nvme_aer(nvme_cid(req));
5145 
5146     if (n->outstanding_aers > n->params.aerl) {
5147         trace_pci_nvme_aer_aerl_exceeded();
5148         return NVME_AER_LIMIT_EXCEEDED;
5149     }
5150 
5151     n->aer_reqs[n->outstanding_aers] = req;
5152     n->outstanding_aers++;
5153 
5154     if (!QTAILQ_EMPTY(&n->aer_queue)) {
5155         nvme_process_aers(n);
5156     }
5157 
5158     return NVME_NO_COMPLETE;
5159 }
5160 
5161 static void nvme_update_dmrsl(NvmeCtrl *n)
5162 {
5163     int nsid;
5164 
5165     for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5166         NvmeNamespace *ns = nvme_ns(n, nsid);
5167         if (!ns) {
5168             continue;
5169         }
5170 
5171         n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5172                                 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5173     }
5174 }
5175 
5176 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5177 {
5178     uint32_t cc = ldl_le_p(&n->bar.cc);
5179 
5180     ns->iocs = nvme_cse_iocs_none;
5181     switch (ns->csi) {
5182     case NVME_CSI_NVM:
5183         if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5184             ns->iocs = nvme_cse_iocs_nvm;
5185         }
5186         break;
5187     case NVME_CSI_ZONED:
5188         if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5189             ns->iocs = nvme_cse_iocs_zoned;
5190         } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5191             ns->iocs = nvme_cse_iocs_nvm;
5192         }
5193         break;
5194     }
5195 }
5196 
5197 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5198 {
5199     NvmeNamespace *ns;
5200     NvmeCtrl *ctrl;
5201     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5202     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5203     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5204     uint8_t sel = dw10 & 0xf;
5205     uint16_t *nr_ids = &list[0];
5206     uint16_t *ids = &list[1];
5207     uint16_t ret;
5208     int i;
5209 
5210     trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5211 
5212     if (!nvme_nsid_valid(n, nsid)) {
5213         return NVME_INVALID_NSID | NVME_DNR;
5214     }
5215 
5216     ns = nvme_subsys_ns(n->subsys, nsid);
5217     if (!ns) {
5218         return NVME_INVALID_FIELD | NVME_DNR;
5219     }
5220 
5221     ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5222     if (ret) {
5223         return ret;
5224     }
5225 
5226     if (!*nr_ids) {
5227         return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5228     }
5229 
5230     *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5231     for (i = 0; i < *nr_ids; i++) {
5232         ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5233         if (!ctrl) {
5234             return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5235         }
5236 
5237         switch (sel) {
5238         case NVME_NS_ATTACHMENT_ATTACH:
5239             if (nvme_ns(ctrl, nsid)) {
5240                 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5241             }
5242 
5243             if (ns->attached && !ns->params.shared) {
5244                 return NVME_NS_PRIVATE | NVME_DNR;
5245             }
5246 
5247             nvme_attach_ns(ctrl, ns);
5248             nvme_select_iocs_ns(ctrl, ns);
5249 
5250             break;
5251 
5252         case NVME_NS_ATTACHMENT_DETACH:
5253             if (!nvme_ns(ctrl, nsid)) {
5254                 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5255             }
5256 
5257             ctrl->namespaces[nsid] = NULL;
5258             ns->attached--;
5259 
5260             nvme_update_dmrsl(ctrl);
5261 
5262             break;
5263 
5264         default:
5265             return NVME_INVALID_FIELD | NVME_DNR;
5266         }
5267 
5268         /*
5269          * Add namespace id to the changed namespace id list for event clearing
5270          * via Get Log Page command.
5271          */
5272         if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5273             nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5274                                NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5275                                NVME_LOG_CHANGED_NSLIST);
5276         }
5277     }
5278 
5279     return NVME_SUCCESS;
5280 }
5281 
5282 typedef struct NvmeFormatAIOCB {
5283     BlockAIOCB common;
5284     BlockAIOCB *aiocb;
5285     QEMUBH *bh;
5286     NvmeRequest *req;
5287     int ret;
5288 
5289     NvmeNamespace *ns;
5290     uint32_t nsid;
5291     bool broadcast;
5292     int64_t offset;
5293 } NvmeFormatAIOCB;
5294 
5295 static void nvme_format_bh(void *opaque);
5296 
5297 static void nvme_format_cancel(BlockAIOCB *aiocb)
5298 {
5299     NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5300 
5301     if (iocb->aiocb) {
5302         blk_aio_cancel_async(iocb->aiocb);
5303     }
5304 }
5305 
5306 static const AIOCBInfo nvme_format_aiocb_info = {
5307     .aiocb_size = sizeof(NvmeFormatAIOCB),
5308     .cancel_async = nvme_format_cancel,
5309     .get_aio_context = nvme_get_aio_context,
5310 };
5311 
5312 static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5313 {
5314     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5315     uint8_t lbaf = dw10 & 0xf;
5316     uint8_t pi = (dw10 >> 5) & 0x7;
5317     uint8_t mset = (dw10 >> 4) & 0x1;
5318     uint8_t pil = (dw10 >> 8) & 0x1;
5319 
5320     trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5321 
5322     ns->id_ns.dps = (pil << 3) | pi;
5323     ns->id_ns.flbas = lbaf | (mset << 4);
5324 
5325     nvme_ns_init_format(ns);
5326 }
5327 
5328 static void nvme_format_ns_cb(void *opaque, int ret)
5329 {
5330     NvmeFormatAIOCB *iocb = opaque;
5331     NvmeRequest *req = iocb->req;
5332     NvmeNamespace *ns = iocb->ns;
5333     int bytes;
5334 
5335     if (ret < 0) {
5336         iocb->ret = ret;
5337         goto done;
5338     }
5339 
5340     assert(ns);
5341 
5342     if (iocb->offset < ns->size) {
5343         bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5344 
5345         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5346                                             bytes, BDRV_REQ_MAY_UNMAP,
5347                                             nvme_format_ns_cb, iocb);
5348 
5349         iocb->offset += bytes;
5350         return;
5351     }
5352 
5353     nvme_format_set(ns, &req->cmd);
5354     ns->status = 0x0;
5355     iocb->ns = NULL;
5356     iocb->offset = 0;
5357 
5358 done:
5359     iocb->aiocb = NULL;
5360     qemu_bh_schedule(iocb->bh);
5361 }
5362 
5363 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5364 {
5365     if (ns->params.zoned) {
5366         return NVME_INVALID_FORMAT | NVME_DNR;
5367     }
5368 
5369     if (lbaf > ns->id_ns.nlbaf) {
5370         return NVME_INVALID_FORMAT | NVME_DNR;
5371     }
5372 
5373     if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5374         return NVME_INVALID_FORMAT | NVME_DNR;
5375     }
5376 
5377     if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5378         return NVME_INVALID_FIELD | NVME_DNR;
5379     }
5380 
5381     return NVME_SUCCESS;
5382 }
5383 
5384 static void nvme_format_bh(void *opaque)
5385 {
5386     NvmeFormatAIOCB *iocb = opaque;
5387     NvmeRequest *req = iocb->req;
5388     NvmeCtrl *n = nvme_ctrl(req);
5389     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5390     uint8_t lbaf = dw10 & 0xf;
5391     uint8_t pi = (dw10 >> 5) & 0x7;
5392     uint16_t status;
5393     int i;
5394 
5395     if (iocb->ret < 0) {
5396         goto done;
5397     }
5398 
5399     if (iocb->broadcast) {
5400         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5401             iocb->ns = nvme_ns(n, i);
5402             if (iocb->ns) {
5403                 iocb->nsid = i;
5404                 break;
5405             }
5406         }
5407     }
5408 
5409     if (!iocb->ns) {
5410         goto done;
5411     }
5412 
5413     status = nvme_format_check(iocb->ns, lbaf, pi);
5414     if (status) {
5415         req->status = status;
5416         goto done;
5417     }
5418 
5419     iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5420     nvme_format_ns_cb(iocb, 0);
5421     return;
5422 
5423 done:
5424     qemu_bh_delete(iocb->bh);
5425     iocb->bh = NULL;
5426 
5427     iocb->common.cb(iocb->common.opaque, iocb->ret);
5428 
5429     qemu_aio_unref(iocb);
5430 }
5431 
5432 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5433 {
5434     NvmeFormatAIOCB *iocb;
5435     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5436     uint16_t status;
5437 
5438     iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5439 
5440     iocb->req = req;
5441     iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5442     iocb->ret = 0;
5443     iocb->ns = NULL;
5444     iocb->nsid = 0;
5445     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5446     iocb->offset = 0;
5447 
5448     if (!iocb->broadcast) {
5449         if (!nvme_nsid_valid(n, nsid)) {
5450             status = NVME_INVALID_NSID | NVME_DNR;
5451             goto out;
5452         }
5453 
5454         iocb->ns = nvme_ns(n, nsid);
5455         if (!iocb->ns) {
5456             status = NVME_INVALID_FIELD | NVME_DNR;
5457             goto out;
5458         }
5459     }
5460 
5461     req->aiocb = &iocb->common;
5462     qemu_bh_schedule(iocb->bh);
5463 
5464     return NVME_NO_COMPLETE;
5465 
5466 out:
5467     qemu_bh_delete(iocb->bh);
5468     iocb->bh = NULL;
5469     qemu_aio_unref(iocb);
5470     return status;
5471 }
5472 
5473 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5474 {
5475     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5476                              nvme_adm_opc_str(req->cmd.opcode));
5477 
5478     if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5479         trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5480         return NVME_INVALID_OPCODE | NVME_DNR;
5481     }
5482 
5483     /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5484     if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5485         return NVME_INVALID_FIELD | NVME_DNR;
5486     }
5487 
5488     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
5489         return NVME_INVALID_FIELD;
5490     }
5491 
5492     switch (req->cmd.opcode) {
5493     case NVME_ADM_CMD_DELETE_SQ:
5494         return nvme_del_sq(n, req);
5495     case NVME_ADM_CMD_CREATE_SQ:
5496         return nvme_create_sq(n, req);
5497     case NVME_ADM_CMD_GET_LOG_PAGE:
5498         return nvme_get_log(n, req);
5499     case NVME_ADM_CMD_DELETE_CQ:
5500         return nvme_del_cq(n, req);
5501     case NVME_ADM_CMD_CREATE_CQ:
5502         return nvme_create_cq(n, req);
5503     case NVME_ADM_CMD_IDENTIFY:
5504         return nvme_identify(n, req);
5505     case NVME_ADM_CMD_ABORT:
5506         return nvme_abort(n, req);
5507     case NVME_ADM_CMD_SET_FEATURES:
5508         return nvme_set_feature(n, req);
5509     case NVME_ADM_CMD_GET_FEATURES:
5510         return nvme_get_feature(n, req);
5511     case NVME_ADM_CMD_ASYNC_EV_REQ:
5512         return nvme_aer(n, req);
5513     case NVME_ADM_CMD_NS_ATTACHMENT:
5514         return nvme_ns_attachment(n, req);
5515     case NVME_ADM_CMD_FORMAT_NVM:
5516         return nvme_format(n, req);
5517     default:
5518         assert(false);
5519     }
5520 
5521     return NVME_INVALID_OPCODE | NVME_DNR;
5522 }
5523 
5524 static void nvme_process_sq(void *opaque)
5525 {
5526     NvmeSQueue *sq = opaque;
5527     NvmeCtrl *n = sq->ctrl;
5528     NvmeCQueue *cq = n->cq[sq->cqid];
5529 
5530     uint16_t status;
5531     hwaddr addr;
5532     NvmeCmd cmd;
5533     NvmeRequest *req;
5534 
5535     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5536         addr = sq->dma_addr + sq->head * n->sqe_size;
5537         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5538             trace_pci_nvme_err_addr_read(addr);
5539             trace_pci_nvme_err_cfs();
5540             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5541             break;
5542         }
5543         nvme_inc_sq_head(sq);
5544 
5545         req = QTAILQ_FIRST(&sq->req_list);
5546         QTAILQ_REMOVE(&sq->req_list, req, entry);
5547         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5548         nvme_req_clear(req);
5549         req->cqe.cid = cmd.cid;
5550         memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5551 
5552         status = sq->sqid ? nvme_io_cmd(n, req) :
5553             nvme_admin_cmd(n, req);
5554         if (status != NVME_NO_COMPLETE) {
5555             req->status = status;
5556             nvme_enqueue_req_completion(cq, req);
5557         }
5558     }
5559 }
5560 
5561 static void nvme_ctrl_reset(NvmeCtrl *n)
5562 {
5563     NvmeNamespace *ns;
5564     int i;
5565 
5566     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5567         ns = nvme_ns(n, i);
5568         if (!ns) {
5569             continue;
5570         }
5571 
5572         nvme_ns_drain(ns);
5573     }
5574 
5575     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5576         if (n->sq[i] != NULL) {
5577             nvme_free_sq(n->sq[i], n);
5578         }
5579     }
5580     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5581         if (n->cq[i] != NULL) {
5582             nvme_free_cq(n->cq[i], n);
5583         }
5584     }
5585 
5586     while (!QTAILQ_EMPTY(&n->aer_queue)) {
5587         NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5588         QTAILQ_REMOVE(&n->aer_queue, event, entry);
5589         g_free(event);
5590     }
5591 
5592     n->aer_queued = 0;
5593     n->outstanding_aers = 0;
5594     n->qs_created = false;
5595 }
5596 
5597 static void nvme_ctrl_shutdown(NvmeCtrl *n)
5598 {
5599     NvmeNamespace *ns;
5600     int i;
5601 
5602     if (n->pmr.dev) {
5603         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5604     }
5605 
5606     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5607         ns = nvme_ns(n, i);
5608         if (!ns) {
5609             continue;
5610         }
5611 
5612         nvme_ns_shutdown(ns);
5613     }
5614 }
5615 
5616 static void nvme_select_iocs(NvmeCtrl *n)
5617 {
5618     NvmeNamespace *ns;
5619     int i;
5620 
5621     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5622         ns = nvme_ns(n, i);
5623         if (!ns) {
5624             continue;
5625         }
5626 
5627         nvme_select_iocs_ns(n, ns);
5628     }
5629 }
5630 
5631 static int nvme_start_ctrl(NvmeCtrl *n)
5632 {
5633     uint64_t cap = ldq_le_p(&n->bar.cap);
5634     uint32_t cc = ldl_le_p(&n->bar.cc);
5635     uint32_t aqa = ldl_le_p(&n->bar.aqa);
5636     uint64_t asq = ldq_le_p(&n->bar.asq);
5637     uint64_t acq = ldq_le_p(&n->bar.acq);
5638     uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5639     uint32_t page_size = 1 << page_bits;
5640 
5641     if (unlikely(n->cq[0])) {
5642         trace_pci_nvme_err_startfail_cq();
5643         return -1;
5644     }
5645     if (unlikely(n->sq[0])) {
5646         trace_pci_nvme_err_startfail_sq();
5647         return -1;
5648     }
5649     if (unlikely(asq & (page_size - 1))) {
5650         trace_pci_nvme_err_startfail_asq_misaligned(asq);
5651         return -1;
5652     }
5653     if (unlikely(acq & (page_size - 1))) {
5654         trace_pci_nvme_err_startfail_acq_misaligned(acq);
5655         return -1;
5656     }
5657     if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5658         trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5659         return -1;
5660     }
5661     if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5662         trace_pci_nvme_err_startfail_page_too_small(
5663                     NVME_CC_MPS(cc),
5664                     NVME_CAP_MPSMIN(cap));
5665         return -1;
5666     }
5667     if (unlikely(NVME_CC_MPS(cc) >
5668                  NVME_CAP_MPSMAX(cap))) {
5669         trace_pci_nvme_err_startfail_page_too_large(
5670                     NVME_CC_MPS(cc),
5671                     NVME_CAP_MPSMAX(cap));
5672         return -1;
5673     }
5674     if (unlikely(NVME_CC_IOCQES(cc) <
5675                  NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5676         trace_pci_nvme_err_startfail_cqent_too_small(
5677                     NVME_CC_IOCQES(cc),
5678                     NVME_CTRL_CQES_MIN(cap));
5679         return -1;
5680     }
5681     if (unlikely(NVME_CC_IOCQES(cc) >
5682                  NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5683         trace_pci_nvme_err_startfail_cqent_too_large(
5684                     NVME_CC_IOCQES(cc),
5685                     NVME_CTRL_CQES_MAX(cap));
5686         return -1;
5687     }
5688     if (unlikely(NVME_CC_IOSQES(cc) <
5689                  NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5690         trace_pci_nvme_err_startfail_sqent_too_small(
5691                     NVME_CC_IOSQES(cc),
5692                     NVME_CTRL_SQES_MIN(cap));
5693         return -1;
5694     }
5695     if (unlikely(NVME_CC_IOSQES(cc) >
5696                  NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5697         trace_pci_nvme_err_startfail_sqent_too_large(
5698                     NVME_CC_IOSQES(cc),
5699                     NVME_CTRL_SQES_MAX(cap));
5700         return -1;
5701     }
5702     if (unlikely(!NVME_AQA_ASQS(aqa))) {
5703         trace_pci_nvme_err_startfail_asqent_sz_zero();
5704         return -1;
5705     }
5706     if (unlikely(!NVME_AQA_ACQS(aqa))) {
5707         trace_pci_nvme_err_startfail_acqent_sz_zero();
5708         return -1;
5709     }
5710 
5711     n->page_bits = page_bits;
5712     n->page_size = page_size;
5713     n->max_prp_ents = n->page_size / sizeof(uint64_t);
5714     n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5715     n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5716     nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5717     nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5718 
5719     nvme_set_timestamp(n, 0ULL);
5720 
5721     QTAILQ_INIT(&n->aer_queue);
5722 
5723     nvme_select_iocs(n);
5724 
5725     return 0;
5726 }
5727 
5728 static void nvme_cmb_enable_regs(NvmeCtrl *n)
5729 {
5730     uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5731     uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5732 
5733     NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5734     NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5735     NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5736     stl_le_p(&n->bar.cmbloc, cmbloc);
5737 
5738     NVME_CMBSZ_SET_SQS(cmbsz, 1);
5739     NVME_CMBSZ_SET_CQS(cmbsz, 0);
5740     NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5741     NVME_CMBSZ_SET_RDS(cmbsz, 1);
5742     NVME_CMBSZ_SET_WDS(cmbsz, 1);
5743     NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
5744     NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5745     stl_le_p(&n->bar.cmbsz, cmbsz);
5746 }
5747 
5748 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5749                            unsigned size)
5750 {
5751     uint64_t cap = ldq_le_p(&n->bar.cap);
5752     uint32_t cc = ldl_le_p(&n->bar.cc);
5753     uint32_t intms = ldl_le_p(&n->bar.intms);
5754     uint32_t csts = ldl_le_p(&n->bar.csts);
5755     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5756 
5757     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5758         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5759                        "MMIO write not 32-bit aligned,"
5760                        " offset=0x%"PRIx64"", offset);
5761         /* should be ignored, fall through for now */
5762     }
5763 
5764     if (unlikely(size < sizeof(uint32_t))) {
5765         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5766                        "MMIO write smaller than 32-bits,"
5767                        " offset=0x%"PRIx64", size=%u",
5768                        offset, size);
5769         /* should be ignored, fall through for now */
5770     }
5771 
5772     switch (offset) {
5773     case NVME_REG_INTMS:
5774         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5775             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5776                            "undefined access to interrupt mask set"
5777                            " when MSI-X is enabled");
5778             /* should be ignored, fall through for now */
5779         }
5780         intms |= data;
5781         stl_le_p(&n->bar.intms, intms);
5782         n->bar.intmc = n->bar.intms;
5783         trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5784         nvme_irq_check(n);
5785         break;
5786     case NVME_REG_INTMC:
5787         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5788             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5789                            "undefined access to interrupt mask clr"
5790                            " when MSI-X is enabled");
5791             /* should be ignored, fall through for now */
5792         }
5793         intms &= ~data;
5794         stl_le_p(&n->bar.intms, intms);
5795         n->bar.intmc = n->bar.intms;
5796         trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5797         nvme_irq_check(n);
5798         break;
5799     case NVME_REG_CC:
5800         trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5801 
5802         /* Windows first sends data, then sends enable bit */
5803         if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
5804             !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
5805         {
5806             cc = data;
5807         }
5808 
5809         if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5810             cc = data;
5811 
5812             /* flush CC since nvme_start_ctrl() needs the value */
5813             stl_le_p(&n->bar.cc, cc);
5814             if (unlikely(nvme_start_ctrl(n))) {
5815                 trace_pci_nvme_err_startfail();
5816                 csts = NVME_CSTS_FAILED;
5817             } else {
5818                 trace_pci_nvme_mmio_start_success();
5819                 csts = NVME_CSTS_READY;
5820             }
5821         } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5822             trace_pci_nvme_mmio_stopped();
5823             nvme_ctrl_reset(n);
5824             cc = 0;
5825             csts &= ~NVME_CSTS_READY;
5826         }
5827 
5828         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5829             trace_pci_nvme_mmio_shutdown_set();
5830             nvme_ctrl_shutdown(n);
5831             cc = data;
5832             csts |= NVME_CSTS_SHST_COMPLETE;
5833         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5834             trace_pci_nvme_mmio_shutdown_cleared();
5835             csts &= ~NVME_CSTS_SHST_COMPLETE;
5836             cc = data;
5837         }
5838 
5839         stl_le_p(&n->bar.cc, cc);
5840         stl_le_p(&n->bar.csts, csts);
5841 
5842         break;
5843     case NVME_REG_CSTS:
5844         if (data & (1 << 4)) {
5845             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5846                            "attempted to W1C CSTS.NSSRO"
5847                            " but CAP.NSSRS is zero (not supported)");
5848         } else if (data != 0) {
5849             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5850                            "attempted to set a read only bit"
5851                            " of controller status");
5852         }
5853         break;
5854     case NVME_REG_NSSR:
5855         if (data == 0x4e564d65) {
5856             trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5857         } else {
5858             /* The spec says that writes of other values have no effect */
5859             return;
5860         }
5861         break;
5862     case NVME_REG_AQA:
5863         stl_le_p(&n->bar.aqa, data);
5864         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5865         break;
5866     case NVME_REG_ASQ:
5867         stn_le_p(&n->bar.asq, size, data);
5868         trace_pci_nvme_mmio_asqaddr(data);
5869         break;
5870     case NVME_REG_ASQ + 4:
5871         stl_le_p((uint8_t *)&n->bar.asq + 4, data);
5872         trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
5873         break;
5874     case NVME_REG_ACQ:
5875         trace_pci_nvme_mmio_acqaddr(data);
5876         stn_le_p(&n->bar.acq, size, data);
5877         break;
5878     case NVME_REG_ACQ + 4:
5879         stl_le_p((uint8_t *)&n->bar.acq + 4, data);
5880         trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
5881         break;
5882     case NVME_REG_CMBLOC:
5883         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5884                        "invalid write to reserved CMBLOC"
5885                        " when CMBSZ is zero, ignored");
5886         return;
5887     case NVME_REG_CMBSZ:
5888         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5889                        "invalid write to read only CMBSZ, ignored");
5890         return;
5891     case NVME_REG_CMBMSC:
5892         if (!NVME_CAP_CMBS(cap)) {
5893             return;
5894         }
5895 
5896         stn_le_p(&n->bar.cmbmsc, size, data);
5897         n->cmb.cmse = false;
5898 
5899         if (NVME_CMBMSC_CRE(data)) {
5900             nvme_cmb_enable_regs(n);
5901 
5902             if (NVME_CMBMSC_CMSE(data)) {
5903                 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
5904                 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
5905                 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5906                     uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
5907                     NVME_CMBSTS_SET_CBAI(cmbsts, 1);
5908                     stl_le_p(&n->bar.cmbsts, cmbsts);
5909                     return;
5910                 }
5911 
5912                 n->cmb.cba = cba;
5913                 n->cmb.cmse = true;
5914             }
5915         } else {
5916             n->bar.cmbsz = 0;
5917             n->bar.cmbloc = 0;
5918         }
5919 
5920         return;
5921     case NVME_REG_CMBMSC + 4:
5922         stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
5923         return;
5924 
5925     case NVME_REG_PMRCAP:
5926         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5927                        "invalid write to PMRCAP register, ignored");
5928         return;
5929     case NVME_REG_PMRCTL:
5930         if (!NVME_CAP_PMRS(cap)) {
5931             return;
5932         }
5933 
5934         stl_le_p(&n->bar.pmrctl, data);
5935         if (NVME_PMRCTL_EN(data)) {
5936             memory_region_set_enabled(&n->pmr.dev->mr, true);
5937             pmrsts = 0;
5938         } else {
5939             memory_region_set_enabled(&n->pmr.dev->mr, false);
5940             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
5941             n->pmr.cmse = false;
5942         }
5943         stl_le_p(&n->bar.pmrsts, pmrsts);
5944         return;
5945     case NVME_REG_PMRSTS:
5946         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5947                        "invalid write to PMRSTS register, ignored");
5948         return;
5949     case NVME_REG_PMREBS:
5950         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5951                        "invalid write to PMREBS register, ignored");
5952         return;
5953     case NVME_REG_PMRSWTP:
5954         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5955                        "invalid write to PMRSWTP register, ignored");
5956         return;
5957     case NVME_REG_PMRMSCL:
5958         if (!NVME_CAP_PMRS(cap)) {
5959             return;
5960         }
5961 
5962         stl_le_p(&n->bar.pmrmscl, data);
5963         n->pmr.cmse = false;
5964 
5965         if (NVME_PMRMSCL_CMSE(data)) {
5966             uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
5967             hwaddr cba = pmrmscu << 32 |
5968                 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
5969             if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5970                 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
5971                 stl_le_p(&n->bar.pmrsts, pmrsts);
5972                 return;
5973             }
5974 
5975             n->pmr.cmse = true;
5976             n->pmr.cba = cba;
5977         }
5978 
5979         return;
5980     case NVME_REG_PMRMSCU:
5981         if (!NVME_CAP_PMRS(cap)) {
5982             return;
5983         }
5984 
5985         stl_le_p(&n->bar.pmrmscu, data);
5986         return;
5987     default:
5988         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5989                        "invalid MMIO write,"
5990                        " offset=0x%"PRIx64", data=%"PRIx64"",
5991                        offset, data);
5992         break;
5993     }
5994 }
5995 
5996 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5997 {
5998     NvmeCtrl *n = (NvmeCtrl *)opaque;
5999     uint8_t *ptr = (uint8_t *)&n->bar;
6000 
6001     trace_pci_nvme_mmio_read(addr, size);
6002 
6003     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
6004         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
6005                        "MMIO read not 32-bit aligned,"
6006                        " offset=0x%"PRIx64"", addr);
6007         /* should RAZ, fall through for now */
6008     } else if (unlikely(size < sizeof(uint32_t))) {
6009         NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6010                        "MMIO read smaller than 32-bits,"
6011                        " offset=0x%"PRIx64"", addr);
6012         /* should RAZ, fall through for now */
6013     }
6014 
6015     if (addr > sizeof(n->bar) - size) {
6016         NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6017                        "MMIO read beyond last register,"
6018                        " offset=0x%"PRIx64", returning 0", addr);
6019 
6020         return 0;
6021     }
6022 
6023     /*
6024      * When PMRWBM bit 1 is set then read from
6025      * from PMRSTS should ensure prior writes
6026      * made it to persistent media
6027      */
6028     if (addr == NVME_REG_PMRSTS &&
6029         (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6030         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6031     }
6032 
6033     return ldn_le_p(ptr + addr, size);
6034 }
6035 
6036 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6037 {
6038     uint32_t qid;
6039 
6040     if (unlikely(addr & ((1 << 2) - 1))) {
6041         NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6042                        "doorbell write not 32-bit aligned,"
6043                        " offset=0x%"PRIx64", ignoring", addr);
6044         return;
6045     }
6046 
6047     if (((addr - 0x1000) >> 2) & 1) {
6048         /* Completion queue doorbell write */
6049 
6050         uint16_t new_head = val & 0xffff;
6051         int start_sqs;
6052         NvmeCQueue *cq;
6053 
6054         qid = (addr - (0x1000 + (1 << 2))) >> 3;
6055         if (unlikely(nvme_check_cqid(n, qid))) {
6056             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6057                            "completion queue doorbell write"
6058                            " for nonexistent queue,"
6059                            " sqid=%"PRIu32", ignoring", qid);
6060 
6061             /*
6062              * NVM Express v1.3d, Section 4.1 state: "If host software writes
6063              * an invalid value to the Submission Queue Tail Doorbell or
6064              * Completion Queue Head Doorbell regiter and an Asynchronous Event
6065              * Request command is outstanding, then an asynchronous event is
6066              * posted to the Admin Completion Queue with a status code of
6067              * Invalid Doorbell Write Value."
6068              *
6069              * Also note that the spec includes the "Invalid Doorbell Register"
6070              * status code, but nowhere does it specify when to use it.
6071              * However, it seems reasonable to use it here in a similar
6072              * fashion.
6073              */
6074             if (n->outstanding_aers) {
6075                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6076                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6077                                    NVME_LOG_ERROR_INFO);
6078             }
6079 
6080             return;
6081         }
6082 
6083         cq = n->cq[qid];
6084         if (unlikely(new_head >= cq->size)) {
6085             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6086                            "completion queue doorbell write value"
6087                            " beyond queue size, sqid=%"PRIu32","
6088                            " new_head=%"PRIu16", ignoring",
6089                            qid, new_head);
6090 
6091             if (n->outstanding_aers) {
6092                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6093                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6094                                    NVME_LOG_ERROR_INFO);
6095             }
6096 
6097             return;
6098         }
6099 
6100         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6101 
6102         start_sqs = nvme_cq_full(cq) ? 1 : 0;
6103         cq->head = new_head;
6104         if (start_sqs) {
6105             NvmeSQueue *sq;
6106             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6107                 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6108             }
6109             timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6110         }
6111 
6112         if (cq->tail == cq->head) {
6113             if (cq->irq_enabled) {
6114                 n->cq_pending--;
6115             }
6116 
6117             nvme_irq_deassert(n, cq);
6118         }
6119     } else {
6120         /* Submission queue doorbell write */
6121 
6122         uint16_t new_tail = val & 0xffff;
6123         NvmeSQueue *sq;
6124 
6125         qid = (addr - 0x1000) >> 3;
6126         if (unlikely(nvme_check_sqid(n, qid))) {
6127             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6128                            "submission queue doorbell write"
6129                            " for nonexistent queue,"
6130                            " sqid=%"PRIu32", ignoring", qid);
6131 
6132             if (n->outstanding_aers) {
6133                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6134                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6135                                    NVME_LOG_ERROR_INFO);
6136             }
6137 
6138             return;
6139         }
6140 
6141         sq = n->sq[qid];
6142         if (unlikely(new_tail >= sq->size)) {
6143             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6144                            "submission queue doorbell write value"
6145                            " beyond queue size, sqid=%"PRIu32","
6146                            " new_tail=%"PRIu16", ignoring",
6147                            qid, new_tail);
6148 
6149             if (n->outstanding_aers) {
6150                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6151                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6152                                    NVME_LOG_ERROR_INFO);
6153             }
6154 
6155             return;
6156         }
6157 
6158         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6159 
6160         sq->tail = new_tail;
6161         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6162     }
6163 }
6164 
6165 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6166                             unsigned size)
6167 {
6168     NvmeCtrl *n = (NvmeCtrl *)opaque;
6169 
6170     trace_pci_nvme_mmio_write(addr, data, size);
6171 
6172     if (addr < sizeof(n->bar)) {
6173         nvme_write_bar(n, addr, data, size);
6174     } else {
6175         nvme_process_db(n, addr, data);
6176     }
6177 }
6178 
6179 static const MemoryRegionOps nvme_mmio_ops = {
6180     .read = nvme_mmio_read,
6181     .write = nvme_mmio_write,
6182     .endianness = DEVICE_LITTLE_ENDIAN,
6183     .impl = {
6184         .min_access_size = 2,
6185         .max_access_size = 8,
6186     },
6187 };
6188 
6189 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6190                            unsigned size)
6191 {
6192     NvmeCtrl *n = (NvmeCtrl *)opaque;
6193     stn_le_p(&n->cmb.buf[addr], size, data);
6194 }
6195 
6196 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6197 {
6198     NvmeCtrl *n = (NvmeCtrl *)opaque;
6199     return ldn_le_p(&n->cmb.buf[addr], size);
6200 }
6201 
6202 static const MemoryRegionOps nvme_cmb_ops = {
6203     .read = nvme_cmb_read,
6204     .write = nvme_cmb_write,
6205     .endianness = DEVICE_LITTLE_ENDIAN,
6206     .impl = {
6207         .min_access_size = 1,
6208         .max_access_size = 8,
6209     },
6210 };
6211 
6212 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6213 {
6214     NvmeParams *params = &n->params;
6215 
6216     if (params->num_queues) {
6217         warn_report("num_queues is deprecated; please use max_ioqpairs "
6218                     "instead");
6219 
6220         params->max_ioqpairs = params->num_queues - 1;
6221     }
6222 
6223     if (n->namespace.blkconf.blk && n->subsys) {
6224         error_setg(errp, "subsystem support is unavailable with legacy "
6225                    "namespace ('drive' property)");
6226         return;
6227     }
6228 
6229     if (params->max_ioqpairs < 1 ||
6230         params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6231         error_setg(errp, "max_ioqpairs must be between 1 and %d",
6232                    NVME_MAX_IOQPAIRS);
6233         return;
6234     }
6235 
6236     if (params->msix_qsize < 1 ||
6237         params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6238         error_setg(errp, "msix_qsize must be between 1 and %d",
6239                    PCI_MSIX_FLAGS_QSIZE + 1);
6240         return;
6241     }
6242 
6243     if (!params->serial) {
6244         error_setg(errp, "serial property not set");
6245         return;
6246     }
6247 
6248     if (n->pmr.dev) {
6249         if (host_memory_backend_is_mapped(n->pmr.dev)) {
6250             error_setg(errp, "can't use already busy memdev: %s",
6251                        object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6252             return;
6253         }
6254 
6255         if (!is_power_of_2(n->pmr.dev->size)) {
6256             error_setg(errp, "pmr backend size needs to be power of 2 in size");
6257             return;
6258         }
6259 
6260         host_memory_backend_set_mapped(n->pmr.dev, true);
6261     }
6262 
6263     if (n->params.zasl > n->params.mdts) {
6264         error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6265                    "than or equal to mdts (Maximum Data Transfer Size)");
6266         return;
6267     }
6268 
6269     if (!n->params.vsl) {
6270         error_setg(errp, "vsl must be non-zero");
6271         return;
6272     }
6273 }
6274 
6275 static void nvme_init_state(NvmeCtrl *n)
6276 {
6277     /* add one to max_ioqpairs to account for the admin queue pair */
6278     n->reg_size = pow2ceil(sizeof(NvmeBar) +
6279                            2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6280     n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6281     n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6282     n->temperature = NVME_TEMPERATURE;
6283     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6284     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6285     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6286 }
6287 
6288 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6289 {
6290     uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6291     uint64_t cap = ldq_le_p(&n->bar.cap);
6292 
6293     n->cmb.buf = g_malloc0(cmb_size);
6294     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6295                           "nvme-cmb", cmb_size);
6296     pci_register_bar(pci_dev, NVME_CMB_BIR,
6297                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6298                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6299                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6300 
6301     NVME_CAP_SET_CMBS(cap, 1);
6302     stq_le_p(&n->bar.cap, cap);
6303 
6304     if (n->params.legacy_cmb) {
6305         nvme_cmb_enable_regs(n);
6306         n->cmb.cmse = true;
6307     }
6308 }
6309 
6310 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6311 {
6312     uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6313 
6314     NVME_PMRCAP_SET_RDS(pmrcap, 1);
6315     NVME_PMRCAP_SET_WDS(pmrcap, 1);
6316     NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6317     /* Turn on bit 1 support */
6318     NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6319     NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6320     stl_le_p(&n->bar.pmrcap, pmrcap);
6321 
6322     pci_register_bar(pci_dev, NVME_PMR_BIR,
6323                      PCI_BASE_ADDRESS_SPACE_MEMORY |
6324                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
6325                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6326 
6327     memory_region_set_enabled(&n->pmr.dev->mr, false);
6328 }
6329 
6330 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6331 {
6332     uint8_t *pci_conf = pci_dev->config;
6333     uint64_t bar_size, msix_table_size, msix_pba_size;
6334     unsigned msix_table_offset, msix_pba_offset;
6335     int ret;
6336 
6337     Error *err = NULL;
6338 
6339     pci_conf[PCI_INTERRUPT_PIN] = 1;
6340     pci_config_set_prog_interface(pci_conf, 0x2);
6341 
6342     if (n->params.use_intel_id) {
6343         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6344         pci_config_set_device_id(pci_conf, 0x5845);
6345     } else {
6346         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6347         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6348     }
6349 
6350     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6351     pcie_endpoint_cap_init(pci_dev, 0x80);
6352 
6353     bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6354     msix_table_offset = bar_size;
6355     msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6356 
6357     bar_size += msix_table_size;
6358     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6359     msix_pba_offset = bar_size;
6360     msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6361 
6362     bar_size += msix_pba_size;
6363     bar_size = pow2ceil(bar_size);
6364 
6365     memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6366     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6367                           n->reg_size);
6368     memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6369 
6370     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6371                      PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6372     ret = msix_init(pci_dev, n->params.msix_qsize,
6373                     &n->bar0, 0, msix_table_offset,
6374                     &n->bar0, 0, msix_pba_offset, 0, &err);
6375     if (ret < 0) {
6376         if (ret == -ENOTSUP) {
6377             warn_report_err(err);
6378         } else {
6379             error_propagate(errp, err);
6380             return ret;
6381         }
6382     }
6383 
6384     if (n->params.cmb_size_mb) {
6385         nvme_init_cmb(n, pci_dev);
6386     }
6387 
6388     if (n->pmr.dev) {
6389         nvme_init_pmr(n, pci_dev);
6390     }
6391 
6392     return 0;
6393 }
6394 
6395 static void nvme_init_subnqn(NvmeCtrl *n)
6396 {
6397     NvmeSubsystem *subsys = n->subsys;
6398     NvmeIdCtrl *id = &n->id_ctrl;
6399 
6400     if (!subsys) {
6401         snprintf((char *)id->subnqn, sizeof(id->subnqn),
6402                  "nqn.2019-08.org.qemu:%s", n->params.serial);
6403     } else {
6404         pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6405     }
6406 }
6407 
6408 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6409 {
6410     NvmeIdCtrl *id = &n->id_ctrl;
6411     uint8_t *pci_conf = pci_dev->config;
6412     uint64_t cap = ldq_le_p(&n->bar.cap);
6413 
6414     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6415     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6416     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6417     strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6418     strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6419 
6420     id->cntlid = cpu_to_le16(n->cntlid);
6421 
6422     id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6423 
6424     id->rab = 6;
6425 
6426     if (n->params.use_intel_id) {
6427         id->ieee[0] = 0xb3;
6428         id->ieee[1] = 0x02;
6429         id->ieee[2] = 0x00;
6430     } else {
6431         id->ieee[0] = 0x00;
6432         id->ieee[1] = 0x54;
6433         id->ieee[2] = 0x52;
6434     }
6435 
6436     id->mdts = n->params.mdts;
6437     id->ver = cpu_to_le32(NVME_SPEC_VER);
6438     id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6439     id->cntrltype = 0x1;
6440 
6441     /*
6442      * Because the controller always completes the Abort command immediately,
6443      * there can never be more than one concurrently executing Abort command,
6444      * so this value is never used for anything. Note that there can easily be
6445      * many Abort commands in the queues, but they are not considered
6446      * "executing" until processed by nvme_abort.
6447      *
6448      * The specification recommends a value of 3 for Abort Command Limit (four
6449      * concurrently outstanding Abort commands), so lets use that though it is
6450      * inconsequential.
6451      */
6452     id->acl = 3;
6453     id->aerl = n->params.aerl;
6454     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6455     id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6456 
6457     /* recommended default value (~70 C) */
6458     id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6459     id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6460 
6461     id->sqes = (0x6 << 4) | 0x6;
6462     id->cqes = (0x4 << 4) | 0x4;
6463     id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6464     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6465                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6466                            NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6467 
6468     /*
6469      * NOTE: If this device ever supports a command set that does NOT use 0x0
6470      * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6471      * should probably be removed.
6472      *
6473      * See comment in nvme_io_cmd.
6474      */
6475     id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6476 
6477     id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6478     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6479                            NVME_CTRL_SGLS_BITBUCKET);
6480 
6481     nvme_init_subnqn(n);
6482 
6483     id->psd[0].mp = cpu_to_le16(0x9c4);
6484     id->psd[0].enlat = cpu_to_le32(0x10);
6485     id->psd[0].exlat = cpu_to_le32(0x4);
6486 
6487     if (n->subsys) {
6488         id->cmic |= NVME_CMIC_MULTI_CTRL;
6489     }
6490 
6491     NVME_CAP_SET_MQES(cap, 0x7ff);
6492     NVME_CAP_SET_CQR(cap, 1);
6493     NVME_CAP_SET_TO(cap, 0xf);
6494     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
6495     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
6496     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
6497     NVME_CAP_SET_MPSMAX(cap, 4);
6498     NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
6499     NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
6500     stq_le_p(&n->bar.cap, cap);
6501 
6502     stl_le_p(&n->bar.vs, NVME_SPEC_VER);
6503     n->bar.intmc = n->bar.intms = 0;
6504 }
6505 
6506 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6507 {
6508     int cntlid;
6509 
6510     if (!n->subsys) {
6511         return 0;
6512     }
6513 
6514     cntlid = nvme_subsys_register_ctrl(n, errp);
6515     if (cntlid < 0) {
6516         return -1;
6517     }
6518 
6519     n->cntlid = cntlid;
6520 
6521     return 0;
6522 }
6523 
6524 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6525 {
6526     uint32_t nsid = ns->params.nsid;
6527     assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6528 
6529     n->namespaces[nsid] = ns;
6530     ns->attached++;
6531 
6532     n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6533                             BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6534 }
6535 
6536 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6537 {
6538     NvmeCtrl *n = NVME(pci_dev);
6539     NvmeNamespace *ns;
6540     Error *local_err = NULL;
6541 
6542     nvme_check_constraints(n, &local_err);
6543     if (local_err) {
6544         error_propagate(errp, local_err);
6545         return;
6546     }
6547 
6548     qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6549               &pci_dev->qdev, n->parent_obj.qdev.id);
6550 
6551     nvme_init_state(n);
6552     if (nvme_init_pci(n, pci_dev, errp)) {
6553         return;
6554     }
6555 
6556     if (nvme_init_subsys(n, errp)) {
6557         error_propagate(errp, local_err);
6558         return;
6559     }
6560     nvme_init_ctrl(n, pci_dev);
6561 
6562     /* setup a namespace if the controller drive property was given */
6563     if (n->namespace.blkconf.blk) {
6564         ns = &n->namespace;
6565         ns->params.nsid = 1;
6566 
6567         if (nvme_ns_setup(ns, errp)) {
6568             return;
6569         }
6570 
6571         nvme_attach_ns(n, ns);
6572     }
6573 }
6574 
6575 static void nvme_exit(PCIDevice *pci_dev)
6576 {
6577     NvmeCtrl *n = NVME(pci_dev);
6578     NvmeNamespace *ns;
6579     int i;
6580 
6581     nvme_ctrl_reset(n);
6582 
6583     if (n->subsys) {
6584         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6585             ns = nvme_ns(n, i);
6586             if (ns) {
6587                 ns->attached--;
6588             }
6589         }
6590 
6591         nvme_subsys_unregister_ctrl(n->subsys, n);
6592     }
6593 
6594     g_free(n->cq);
6595     g_free(n->sq);
6596     g_free(n->aer_reqs);
6597 
6598     if (n->params.cmb_size_mb) {
6599         g_free(n->cmb.buf);
6600     }
6601 
6602     if (n->pmr.dev) {
6603         host_memory_backend_set_mapped(n->pmr.dev, false);
6604     }
6605     msix_uninit(pci_dev, &n->bar0, &n->bar0);
6606     memory_region_del_subregion(&n->bar0, &n->iomem);
6607 }
6608 
6609 static Property nvme_props[] = {
6610     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6611     DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6612                      HostMemoryBackend *),
6613     DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6614                      NvmeSubsystem *),
6615     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6616     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6617     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6618     DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6619     DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6620     DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6621     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6622     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6623     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6624     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6625     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6626     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6627     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6628                      params.auto_transition_zones, true),
6629     DEFINE_PROP_END_OF_LIST(),
6630 };
6631 
6632 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6633                                    void *opaque, Error **errp)
6634 {
6635     NvmeCtrl *n = NVME(obj);
6636     uint8_t value = n->smart_critical_warning;
6637 
6638     visit_type_uint8(v, name, &value, errp);
6639 }
6640 
6641 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6642                                    void *opaque, Error **errp)
6643 {
6644     NvmeCtrl *n = NVME(obj);
6645     uint8_t value, old_value, cap = 0, index, event;
6646 
6647     if (!visit_type_uint8(v, name, &value, errp)) {
6648         return;
6649     }
6650 
6651     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6652           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6653     if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6654         cap |= NVME_SMART_PMR_UNRELIABLE;
6655     }
6656 
6657     if ((value & cap) != value) {
6658         error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6659                    value & ~cap);
6660         return;
6661     }
6662 
6663     old_value = n->smart_critical_warning;
6664     n->smart_critical_warning = value;
6665 
6666     /* only inject new bits of smart critical warning */
6667     for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6668         event = 1 << index;
6669         if (value & ~old_value & event)
6670             nvme_smart_event(n, event);
6671     }
6672 }
6673 
6674 static const VMStateDescription nvme_vmstate = {
6675     .name = "nvme",
6676     .unmigratable = 1,
6677 };
6678 
6679 static void nvme_class_init(ObjectClass *oc, void *data)
6680 {
6681     DeviceClass *dc = DEVICE_CLASS(oc);
6682     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6683 
6684     pc->realize = nvme_realize;
6685     pc->exit = nvme_exit;
6686     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6687     pc->revision = 2;
6688 
6689     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6690     dc->desc = "Non-Volatile Memory Express";
6691     device_class_set_props(dc, nvme_props);
6692     dc->vmsd = &nvme_vmstate;
6693 }
6694 
6695 static void nvme_instance_init(Object *obj)
6696 {
6697     NvmeCtrl *n = NVME(obj);
6698 
6699     device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6700                                   "bootindex", "/namespace@1,0",
6701                                   DEVICE(obj));
6702 
6703     object_property_add(obj, "smart_critical_warning", "uint8",
6704                         nvme_get_smart_warning,
6705                         nvme_set_smart_warning, NULL, NULL);
6706 }
6707 
6708 static const TypeInfo nvme_info = {
6709     .name          = TYPE_NVME,
6710     .parent        = TYPE_PCI_DEVICE,
6711     .instance_size = sizeof(NvmeCtrl),
6712     .instance_init = nvme_instance_init,
6713     .class_init    = nvme_class_init,
6714     .interfaces = (InterfaceInfo[]) {
6715         { INTERFACE_PCIE_DEVICE },
6716         { }
6717     },
6718 };
6719 
6720 static const TypeInfo nvme_bus_info = {
6721     .name = TYPE_NVME_BUS,
6722     .parent = TYPE_BUS,
6723     .instance_size = sizeof(NvmeBus),
6724 };
6725 
6726 static void nvme_register_types(void)
6727 {
6728     type_register_static(&nvme_info);
6729     type_register_static(&nvme_bus_info);
6730 }
6731 
6732 type_init(nvme_register_types)
6733