xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 3276dde4f262588f3645f2adbc84d07cb6981d3e)
1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <keith.busch@intel.com>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10 
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13  *
14  *  https://nvmexpress.org/developers/nvme-specification/
15  *
16  *
17  * Notes on coding style
18  * ---------------------
19  * While QEMU coding style prefers lowercase hexadecimals in constants, the
20  * NVMe subsystem uses the format from the NVMe specifications in the comments
21  * (i.e. 'h' suffix instead of '0x' prefix).
22  *
23  * Usage
24  * -----
25  * See docs/system/nvme.rst for extensive documentation.
26  *
27  * Add options:
28  *      -drive file=<file>,if=none,id=<drive_id>
29  *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30  *      -device nvme,serial=<serial>,id=<bus_name>, \
31  *              cmb_size_mb=<cmb_size_mb[optional]>, \
32  *              [pmrdev=<mem_backend_file_id>,] \
33  *              max_ioqpairs=<N[optional]>, \
34  *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35  *              mdts=<N[optional]>,vsl=<N[optional]>, \
36  *              zoned.zasl=<N[optional]>, \
37  *              zoned.auto_transition=<on|off[optional]>, \
38  *              subsys=<subsys_id>
39  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
40  *              zoned=<true|false[optional]>, \
41  *              subsys=<subsys_id>,detached=<true|false[optional]>
42  *
43  * Note cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed to be at
44  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
45  * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
46  * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
47  *
48  * Enabling PMR emulation can be achieved by pointing pmrdev to a memory-backend-file.
49  * For example:
50  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
51  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
52  *
53  * The PMR will use BAR 4/5 exclusively.
54  *
55  * To place controller(s) and namespace(s) in a subsystem, provide an
56  * nvme-subsys device as shown above.
57  *
58  * nvme subsystem device parameters
59  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60  * - `nqn`
61  *   This parameter provides the `<nqn_id>` part of the string
62  *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
63  *   of subsystem controllers. Note that `<nqn_id>` should be unique per
64  *   subsystem, but this is not enforced by QEMU. If not specified, it will
65  *   default to the value of the `id` parameter (`<subsys_id>`).
66  *
67  * nvme device parameters
68  * ~~~~~~~~~~~~~~~~~~~~~~
69  * - `subsys`
70  *   Specifying this parameter attaches the controller to the subsystem and
71  *   the SUBNQN field in the controller will report the NQN of the subsystem
72  *   device. This also enables multi controller capability represented in
73  *   Identify Controller data structure in CMIC (Controller Multi-path I/O and
74  *   Namespace Sharing Capabilities).
75  *
76  * - `aerl`
77  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
78  *   of concurrently outstanding Asynchronous Event Request commands supported
79  *   by the controller. This is a 0's based value.
80  *
81  * - `aer_max_queued`
82  *   This is the maximum number of events that the device will enqueue for
83  *   completion when there are no outstanding AERs. When the maximum number of
84  *   enqueued events is reached, subsequent events will be dropped.
85  *
86  * - `mdts`
87  *   Indicates the maximum data transfer size for a command that transfers data
88  *   between host-accessible memory and the controller. The value is specified
89  *   as a power of two (2^n) and is in units of the minimum memory page size
90  *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
91  *
92  * - `vsl`
93  *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
94  *   this value is specified as a power of two (2^n) and is in units of the
95  *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
96  *   KiB).
97  *
98  * - `zoned.zasl`
99  *   Indicates the maximum data transfer size for the Zone Append command. Like
100  *   `mdts`, the value is specified as a power of two (2^n) and is in units of
101  *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
102  *   defaulting to the value of `mdts`).
103  *
104  * - `zoned.auto_transition`
105  *   Indicates whether zones in the Implicitly Opened state may automatically be
106  *   transitioned to the Closed state for resource management purposes.
107  *   Defaults to 'on'.
108  *
109  * nvme namespace device parameters
110  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111  * - `shared`
112  *   When the parent nvme device (as defined explicitly by the 'bus' parameter
113  *   or implicitly by the most recently defined NvmeBus) is linked to an
114  *   nvme-subsys device, the namespace will be attached to all controllers in
115  *   the subsystem. If set to 'off' (the default), the namespace will remain a
116  *   private namespace and may only be attached to a single controller at a
117  *   time.
118  *
119  * - `detached`
120  *   This parameter is only valid together with the `subsys` parameter. If left
121  *   at the default value (`false/off`), the namespace will be attached to all
122  *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
123  *   namespace will be available in the subsystem but not attached to any
124  *   controllers.
125  *
126  * Setting `zoned` to true selects the Zoned Command Set for the namespace.
127  * In this case, the following namespace properties are available to configure
128  * zoned operation:
129  *     zoned.zone_size=<zone size in bytes, default: 128MiB>
130  *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
131  *
132  *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
133  *         The value 0 (default) forces zone capacity to be the same as zone
134  *         size. The value of this property may not exceed zone size.
135  *
136  *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
137  *         This value needs to be specified in 64B units. If it is zero,
138  *         namespace(s) will not support zone descriptor extensions.
139  *
140  *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
141  *         The default value means there is no limit to the number of
142  *         concurrently active zones.
143  *
144  *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
145  *         The default value means there is no limit to the number of
146  *         concurrently open zones.
147  *
148  *     zoned.cross_read=<enable RAZB, default: false>
149  *         Setting this property to true enables Read Across Zone Boundaries.
150  */
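/*
 * For illustration, a minimal zoned configuration could look as follows (a
 * sketch only; the image path, ids, serial and sizes are arbitrary
 * placeholders):
 *
 *      -drive file=zns.img,if=none,id=nvm-1
 *      -device nvme,id=nvme-ctrl-0,serial=deadbeef,mdts=7
 *      -device nvme-ns,drive=nvm-1,bus=nvme-ctrl-0,nsid=1,zoned=true, \
 *              zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32, \
 *              zoned.cross_read=true
 */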
151 
152 #include "qemu/osdep.h"
153 #include "qemu/cutils.h"
154 #include "qemu/error-report.h"
155 #include "qemu/log.h"
156 #include "qemu/units.h"
157 #include "qapi/error.h"
158 #include "qapi/visitor.h"
159 #include "sysemu/sysemu.h"
160 #include "sysemu/block-backend.h"
161 #include "sysemu/hostmem.h"
162 #include "hw/pci/msix.h"
163 #include "migration/vmstate.h"
164 
165 #include "nvme.h"
166 #include "trace.h"
167 
168 #define NVME_MAX_IOQPAIRS 0xffff
169 #define NVME_DB_SIZE  4
170 #define NVME_SPEC_VER 0x00010400
171 #define NVME_CMB_BIR 2
172 #define NVME_PMR_BIR 4
173 #define NVME_TEMPERATURE 0x143
174 #define NVME_TEMPERATURE_WARNING 0x157
175 #define NVME_TEMPERATURE_CRITICAL 0x175
176 #define NVME_NUM_FW_SLOTS 1
177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
178 
179 #define NVME_GUEST_ERR(trace, fmt, ...) \
180     do { \
181         (trace_##trace)(__VA_ARGS__); \
182         qemu_log_mask(LOG_GUEST_ERROR, #trace \
183             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
184     } while (0)
185 
186 static const bool nvme_feature_support[NVME_FID_MAX] = {
187     [NVME_ARBITRATION]              = true,
188     [NVME_POWER_MANAGEMENT]         = true,
189     [NVME_TEMPERATURE_THRESHOLD]    = true,
190     [NVME_ERROR_RECOVERY]           = true,
191     [NVME_VOLATILE_WRITE_CACHE]     = true,
192     [NVME_NUMBER_OF_QUEUES]         = true,
193     [NVME_INTERRUPT_COALESCING]     = true,
194     [NVME_INTERRUPT_VECTOR_CONF]    = true,
195     [NVME_WRITE_ATOMICITY]          = true,
196     [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
197     [NVME_TIMESTAMP]                = true,
198     [NVME_COMMAND_SET_PROFILE]      = true,
199 };
200 
201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
202     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
203     [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
204     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
205     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
206     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
207     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
208     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
209 };
210 
211 static const uint32_t nvme_cse_acs[256] = {
212     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
213     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
214     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
215     [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
216     [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
217     [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
218     [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
219     [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
220     [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
221     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
222     [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
223     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
224 };
225 
226 static const uint32_t nvme_cse_iocs_none[256];
227 
228 static const uint32_t nvme_cse_iocs_nvm[256] = {
229     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
231     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
233     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
235     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
236     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
237 };
238 
239 static const uint32_t nvme_cse_iocs_zoned[256] = {
240     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
241     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
242     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
243     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
244     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
245     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
246     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
247     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
248     [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
249     [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
250     [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
251 };
252 
253 static void nvme_process_sq(void *opaque);
254 
255 static uint16_t nvme_sqid(NvmeRequest *req)
256 {
257     return le16_to_cpu(req->sq->sqid);
258 }
259 
260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261                                    NvmeZoneState state)
262 {
263     if (QTAILQ_IN_USE(zone, entry)) {
264         switch (nvme_get_zone_state(zone)) {
265         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266             QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267             break;
268         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270             break;
271         case NVME_ZONE_STATE_CLOSED:
272             QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273             break;
274         case NVME_ZONE_STATE_FULL:
275             QTAILQ_REMOVE(&ns->full_zones, zone, entry);
276         default:
277             ;
278         }
279     }
280 
281     nvme_set_zone_state(zone, state);
282 
283     switch (state) {
284     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285         QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286         break;
287     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288         QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289         break;
290     case NVME_ZONE_STATE_CLOSED:
291         QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292         break;
293     case NVME_ZONE_STATE_FULL:
294         QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
295     case NVME_ZONE_STATE_READ_ONLY:
296         break;
297     default:
298         zone->d.za = 0;
299     }
300 }
301 
302 /*
303  * Check if we can open a zone without exceeding open/active limits.
304  * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
305  */
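/*
 * For example (illustrative values only): with zoned.max_open=4 and four
 * zones already open, a request to open one more (opn=1) fails with Too
 * Many Open Zones, while zoned.max_active=0 imposes no active-zone limit.
 */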
306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
307 {
308     if (ns->params.max_active_zones != 0 &&
309         ns->nr_active_zones + act > ns->params.max_active_zones) {
310         trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
311         return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
312     }
313     if (ns->params.max_open_zones != 0 &&
314         ns->nr_open_zones + opn > ns->params.max_open_zones) {
315         trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
316         return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
317     }
318 
319     return NVME_SUCCESS;
320 }
321 
322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
323 {
324     hwaddr hi, lo;
325 
326     if (!n->cmb.cmse) {
327         return false;
328     }
329 
330     lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
331     hi = lo + int128_get64(n->cmb.mem.size);
332 
333     return addr >= lo && addr < hi;
334 }
335 
336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
337 {
338     hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
339     return &n->cmb.buf[addr - base];
340 }
341 
342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
343 {
344     hwaddr hi;
345 
346     if (!n->pmr.cmse) {
347         return false;
348     }
349 
350     hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
351 
352     return addr >= n->pmr.cba && addr < hi;
353 }
354 
355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
356 {
357     return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
358 }
359 
360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
361 {
362     hwaddr hi = addr + size - 1;
363     if (hi < addr) {
364         return 1;
365     }
366 
367     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
368         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
369         return 0;
370     }
371 
372     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
373         memcpy(buf, nvme_addr_to_pmr(n, addr), size);
374         return 0;
375     }
376 
377     return pci_dma_read(&n->parent_obj, addr, buf, size);
378 }
379 
380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
381 {
382     hwaddr hi = addr + size - 1;
383     if (hi < addr) {
384         return 1;
385     }
386 
387     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
388         memcpy(nvme_addr_to_cmb(n, addr), buf, size);
389         return 0;
390     }
391 
392     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
393         memcpy(nvme_addr_to_pmr(n, addr), buf, size);
394         return 0;
395     }
396 
397     return pci_dma_write(&n->parent_obj, addr, buf, size);
398 }
399 
400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
401 {
402     return nsid &&
403         (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
404 }
405 
406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
407 {
408     return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
409 }
410 
411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
412 {
413     return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
414 }
415 
416 static void nvme_inc_cq_tail(NvmeCQueue *cq)
417 {
418     cq->tail++;
419     if (cq->tail >= cq->size) {
420         cq->tail = 0;
421         cq->phase = !cq->phase;
422     }
423 }
424 
425 static void nvme_inc_sq_head(NvmeSQueue *sq)
426 {
427     sq->head = (sq->head + 1) % sq->size;
428 }
429 
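/*
 * As in a classic ring buffer, the completion queue is considered full when
 * the tail is one slot behind the head, so one entry is always left unused
 * to distinguish a full queue from an empty one.
 */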
430 static uint8_t nvme_cq_full(NvmeCQueue *cq)
431 {
432     return (cq->tail + 1) % cq->size == cq->head;
433 }
434 
435 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
436 {
437     return sq->head == sq->tail;
438 }
439 
440 static void nvme_irq_check(NvmeCtrl *n)
441 {
442     if (msix_enabled(&(n->parent_obj))) {
443         return;
444     }
445     if (~n->bar.intms & n->irq_status) {
446         pci_irq_assert(&n->parent_obj);
447     } else {
448         pci_irq_deassert(&n->parent_obj);
449     }
450 }
451 
452 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
453 {
454     if (cq->irq_enabled) {
455         if (msix_enabled(&(n->parent_obj))) {
456             trace_pci_nvme_irq_msix(cq->vector);
457             msix_notify(&(n->parent_obj), cq->vector);
458         } else {
459             trace_pci_nvme_irq_pin();
460             assert(cq->vector < 32);
461             n->irq_status |= 1 << cq->vector;
462             nvme_irq_check(n);
463         }
464     } else {
465         trace_pci_nvme_irq_masked();
466     }
467 }
468 
469 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
470 {
471     if (cq->irq_enabled) {
472         if (msix_enabled(&(n->parent_obj))) {
473             return;
474         } else {
475             assert(cq->vector < 32);
476             n->irq_status &= ~(1 << cq->vector);
477             nvme_irq_check(n);
478         }
479     }
480 }
481 
482 static void nvme_req_clear(NvmeRequest *req)
483 {
484     req->ns = NULL;
485     req->opaque = NULL;
486     req->aiocb = NULL;
487     memset(&req->cqe, 0x0, sizeof(req->cqe));
488     req->status = NVME_SUCCESS;
489 }
490 
491 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
492 {
493     if (dma) {
494         pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
495         sg->flags = NVME_SG_DMA;
496     } else {
497         qemu_iovec_init(&sg->iov, 0);
498     }
499 
500     sg->flags |= NVME_SG_ALLOC;
501 }
502 
503 static inline void nvme_sg_unmap(NvmeSg *sg)
504 {
505     if (!(sg->flags & NVME_SG_ALLOC)) {
506         return;
507     }
508 
509     if (sg->flags & NVME_SG_DMA) {
510         qemu_sglist_destroy(&sg->qsg);
511     } else {
512         qemu_iovec_destroy(&sg->iov);
513     }
514 
515     memset(sg, 0x0, sizeof(*sg));
516 }
517 
518 /*
519  * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
520  * holds both data and metadata. This function splits the data and metadata
521  * into two separate QSG/IOVs.
522  */
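/*
 * For illustration, with 512 byte logical blocks and 8 bytes of metadata per
 * block (example values only), the extended LBA stream mapped by the DPTR is
 *
 *     [512B data][8B md][512B data][8B md]...
 *
 * and nvme_sg_split() routes the alternating chunks into the data and mdata
 * scatter/gather lists respectively.
 */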
523 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
524                           NvmeSg *mdata)
525 {
526     NvmeSg *dst = data;
527     uint32_t trans_len, count = ns->lbasz;
528     uint64_t offset = 0;
529     bool dma = sg->flags & NVME_SG_DMA;
530     size_t sge_len;
531     size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
532     int sg_idx = 0;
533 
534     assert(sg->flags & NVME_SG_ALLOC);
535 
536     while (sg_len) {
537         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
538 
539         trans_len = MIN(sg_len, count);
540         trans_len = MIN(trans_len, sge_len - offset);
541 
542         if (dst) {
543             if (dma) {
544                 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
545                                 trans_len);
546             } else {
547                 qemu_iovec_add(&dst->iov,
548                                sg->iov.iov[sg_idx].iov_base + offset,
549                                trans_len);
550             }
551         }
552 
553         sg_len -= trans_len;
554         count -= trans_len;
555         offset += trans_len;
556 
557         if (count == 0) {
558             dst = (dst == data) ? mdata : data;
559             count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
560         }
561 
562         if (sge_len == offset) {
563             offset = 0;
564             sg_idx++;
565         }
566     }
567 }
568 
569 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
570                                   size_t len)
571 {
572     if (!len) {
573         return NVME_SUCCESS;
574     }
575 
576     trace_pci_nvme_map_addr_cmb(addr, len);
577 
578     if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
579         return NVME_DATA_TRAS_ERROR;
580     }
581 
582     qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
583 
584     return NVME_SUCCESS;
585 }
586 
587 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
588                                   size_t len)
589 {
590     if (!len) {
591         return NVME_SUCCESS;
592     }
593 
594     if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
595         return NVME_DATA_TRAS_ERROR;
596     }
597 
598     qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
599 
600     return NVME_SUCCESS;
601 }
602 
603 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
604 {
605     bool cmb = false, pmr = false;
606 
607     if (!len) {
608         return NVME_SUCCESS;
609     }
610 
611     trace_pci_nvme_map_addr(addr, len);
612 
613     if (nvme_addr_is_cmb(n, addr)) {
614         cmb = true;
615     } else if (nvme_addr_is_pmr(n, addr)) {
616         pmr = true;
617     }
618 
619     if (cmb || pmr) {
620         if (sg->flags & NVME_SG_DMA) {
621             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
622         }
623 
624         if (cmb) {
625             return nvme_map_addr_cmb(n, &sg->iov, addr, len);
626         } else {
627             return nvme_map_addr_pmr(n, &sg->iov, addr, len);
628         }
629     }
630 
631     if (!(sg->flags & NVME_SG_DMA)) {
632         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
633     }
634 
635     qemu_sglist_add(&sg->qsg, addr, len);
636 
637     return NVME_SUCCESS;
638 }
639 
640 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
641 {
642     return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
643 }
644 
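/*
 * A worked example (illustrative numbers): with a 4 KiB page size, a 12 KiB
 * transfer with a page-aligned PRP1 maps its first 4 KiB through PRP1; the
 * remaining 8 KiB exceeds one page, so PRP2 points to a PRP list whose first
 * two entries map the remaining two pages.
 */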
645 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
646                              uint64_t prp2, uint32_t len)
647 {
648     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
649     trans_len = MIN(len, trans_len);
650     int num_prps = (len >> n->page_bits) + 1;
651     uint16_t status;
652     int ret;
653 
654     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
655 
656     nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
657 
658     status = nvme_map_addr(n, sg, prp1, trans_len);
659     if (status) {
660         goto unmap;
661     }
662 
663     len -= trans_len;
664     if (len) {
665         if (len > n->page_size) {
666             uint64_t prp_list[n->max_prp_ents];
667             uint32_t nents, prp_trans;
668             int i = 0;
669 
670             /*
671              * The first PRP list entry, pointed to by PRP2, may contain an offset.
672              * Hence, we need to calculate the number of entries based on
673              * that offset.
674              */
675             nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
676             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
677             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
678             if (ret) {
679                 trace_pci_nvme_err_addr_read(prp2);
680                 status = NVME_DATA_TRAS_ERROR;
681                 goto unmap;
682             }
683             while (len != 0) {
684                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
685 
686                 if (i == nents - 1 && len > n->page_size) {
687                     if (unlikely(prp_ent & (n->page_size - 1))) {
688                         trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
689                         status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
690                         goto unmap;
691                     }
692 
693                     i = 0;
694                     nents = (len + n->page_size - 1) >> n->page_bits;
695                     nents = MIN(nents, n->max_prp_ents);
696                     prp_trans = nents * sizeof(uint64_t);
697                     ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
698                                          prp_trans);
699                     if (ret) {
700                         trace_pci_nvme_err_addr_read(prp_ent);
701                         status = NVME_DATA_TRAS_ERROR;
702                         goto unmap;
703                     }
704                     prp_ent = le64_to_cpu(prp_list[i]);
705                 }
706 
707                 if (unlikely(prp_ent & (n->page_size - 1))) {
708                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
709                     status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
710                     goto unmap;
711                 }
712 
713                 trans_len = MIN(len, n->page_size);
714                 status = nvme_map_addr(n, sg, prp_ent, trans_len);
715                 if (status) {
716                     goto unmap;
717                 }
718 
719                 len -= trans_len;
720                 i++;
721             }
722         } else {
723             if (unlikely(prp2 & (n->page_size - 1))) {
724                 trace_pci_nvme_err_invalid_prp2_align(prp2);
725                 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
726                 goto unmap;
727             }
728             status = nvme_map_addr(n, sg, prp2, len);
729             if (status) {
730                 goto unmap;
731             }
732         }
733     }
734 
735     return NVME_SUCCESS;
736 
737 unmap:
738     nvme_sg_unmap(sg);
739     return status;
740 }
741 
742 /*
743  * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
744  * number of bytes mapped from *len.
745  */
746 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
747                                   NvmeSglDescriptor *segment, uint64_t nsgld,
748                                   size_t *len, NvmeCmd *cmd)
749 {
750     dma_addr_t addr, trans_len;
751     uint32_t dlen;
752     uint16_t status;
753 
754     for (int i = 0; i < nsgld; i++) {
755         uint8_t type = NVME_SGL_TYPE(segment[i].type);
756 
757         switch (type) {
758         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
759             if (cmd->opcode == NVME_CMD_WRITE) {
760                 continue;
761             }
762         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
763             break;
764         case NVME_SGL_DESCR_TYPE_SEGMENT:
765         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
766             return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
767         default:
768             return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
769         }
770 
771         dlen = le32_to_cpu(segment[i].len);
772 
773         if (!dlen) {
774             continue;
775         }
776 
777         if (*len == 0) {
778             /*
779              * All data has been mapped, but the SGL contains additional
780              * segments and/or descriptors. The controller might accept
781              * ignoring the rest of the SGL.
782              */
783             uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
784             if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
785                 break;
786             }
787 
788             trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
789             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
790         }
791 
792         trans_len = MIN(*len, dlen);
793 
794         if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
795             goto next;
796         }
797 
798         addr = le64_to_cpu(segment[i].addr);
799 
800         if (UINT64_MAX - addr < dlen) {
801             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
802         }
803 
804         status = nvme_map_addr(n, sg, addr, trans_len);
805         if (status) {
806             return status;
807         }
808 
809 next:
810         *len -= trans_len;
811     }
812 
813     return NVME_SUCCESS;
814 }
815 
816 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
817                              size_t len, NvmeCmd *cmd)
818 {
819     /*
820      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
821      * dynamically allocating a potentially huge SGL. The spec allows the SGL
822      * to be larger (as in number of bytes required to describe the SGL
823      * descriptors and segment chain) than the command transfer size, so it is
824      * not bounded by MDTS.
825      */
826     const int SEG_CHUNK_SIZE = 256;
827 
828     NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
829     uint64_t nsgld;
830     uint32_t seg_len;
831     uint16_t status;
832     hwaddr addr;
833     int ret;
834 
835     sgld = &sgl;
836     addr = le64_to_cpu(sgl.addr);
837 
838     trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
839 
840     nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
841 
842     /*
843      * If the entire transfer can be described with a single data block it can
844      * be mapped directly.
845      */
846     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
847         status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
848         if (status) {
849             goto unmap;
850         }
851 
852         goto out;
853     }
854 
855     for (;;) {
856         switch (NVME_SGL_TYPE(sgld->type)) {
857         case NVME_SGL_DESCR_TYPE_SEGMENT:
858         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
859             break;
860         default:
861             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
862         }
863 
864         seg_len = le32_to_cpu(sgld->len);
865 
866         /* check the length of the (Last) Segment descriptor */
867         if ((!seg_len || seg_len & 0xf) &&
868             (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
869             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
870         }
871 
872         if (UINT64_MAX - addr < seg_len) {
873             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
874         }
875 
876         nsgld = seg_len / sizeof(NvmeSglDescriptor);
877 
878         while (nsgld > SEG_CHUNK_SIZE) {
879             if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
880                 trace_pci_nvme_err_addr_read(addr);
881                 status = NVME_DATA_TRAS_ERROR;
882                 goto unmap;
883             }
884 
885             status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
886                                        &len, cmd);
887             if (status) {
888                 goto unmap;
889             }
890 
891             nsgld -= SEG_CHUNK_SIZE;
892             addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
893         }
894 
895         ret = nvme_addr_read(n, addr, segment, nsgld *
896                              sizeof(NvmeSglDescriptor));
897         if (ret) {
898             trace_pci_nvme_err_addr_read(addr);
899             status = NVME_DATA_TRAS_ERROR;
900             goto unmap;
901         }
902 
903         last_sgld = &segment[nsgld - 1];
904 
905         /*
906          * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
907          * then we are done.
908          */
909         switch (NVME_SGL_TYPE(last_sgld->type)) {
910         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
911         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
912             status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
913             if (status) {
914                 goto unmap;
915             }
916 
917             goto out;
918 
919         default:
920             break;
921         }
922 
923         /*
924          * If the last descriptor was not a Data Block or Bit Bucket, then the
925          * current segment must not be a Last Segment.
926          */
927         if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
928             status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
929             goto unmap;
930         }
931 
932         sgld = last_sgld;
933         addr = le64_to_cpu(sgld->addr);
934 
935         /*
936          * Do not map the last descriptor; it will be a Segment or Last Segment
937          * descriptor and is handled by the next iteration.
938          */
939         status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
940         if (status) {
941             goto unmap;
942         }
943     }
944 
945 out:
946     /* if there is any residual left in len, the SGL was too short */
947     if (len) {
948         status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
949         goto unmap;
950     }
951 
952     return NVME_SUCCESS;
953 
954 unmap:
955     nvme_sg_unmap(sg);
956     return status;
957 }
958 
959 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
960                        NvmeCmd *cmd)
961 {
962     uint64_t prp1, prp2;
963 
964     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
965     case NVME_PSDT_PRP:
966         prp1 = le64_to_cpu(cmd->dptr.prp1);
967         prp2 = le64_to_cpu(cmd->dptr.prp2);
968 
969         return nvme_map_prp(n, sg, prp1, prp2, len);
970     case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
971     case NVME_PSDT_SGL_MPTR_SGL:
972         return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
973     default:
974         return NVME_INVALID_FIELD;
975     }
976 }
977 
978 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
979                               NvmeCmd *cmd)
980 {
981     int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
982     hwaddr mptr = le64_to_cpu(cmd->mptr);
983     uint16_t status;
984 
985     if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
986         NvmeSglDescriptor sgl;
987 
988         if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
989             return NVME_DATA_TRAS_ERROR;
990         }
991 
992         status = nvme_map_sgl(n, sg, sgl, len, cmd);
993         if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
994             status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
995         }
996 
997         return status;
998     }
999 
1000     nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1001     status = nvme_map_addr(n, sg, mptr, len);
1002     if (status) {
1003         nvme_sg_unmap(sg);
1004     }
1005 
1006     return status;
1007 }
1008 
1009 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1010 {
1011     NvmeNamespace *ns = req->ns;
1012     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1013     uint16_t ctrl = le16_to_cpu(rw->control);
1014     size_t len = nvme_l2b(ns, nlb);
1015     uint16_t status;
1016 
1017     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
1018         (ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) {
1019         goto out;
1020     }
1021 
1022     if (nvme_ns_ext(ns)) {
1023         NvmeSg sg;
1024 
1025         len += nvme_m2b(ns, nlb);
1026 
1027         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1028         if (status) {
1029             return status;
1030         }
1031 
1032         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1033         nvme_sg_split(&sg, ns, &req->sg, NULL);
1034         nvme_sg_unmap(&sg);
1035 
1036         return NVME_SUCCESS;
1037     }
1038 
1039 out:
1040     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1041 }
1042 
1043 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1044 {
1045     NvmeNamespace *ns = req->ns;
1046     size_t len = nvme_m2b(ns, nlb);
1047     uint16_t status;
1048 
1049     if (nvme_ns_ext(ns)) {
1050         NvmeSg sg;
1051 
1052         len += nvme_l2b(ns, nlb);
1053 
1054         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1055         if (status) {
1056             return status;
1057         }
1058 
1059         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1060         nvme_sg_split(&sg, ns, NULL, &req->sg);
1061         nvme_sg_unmap(&sg);
1062 
1063         return NVME_SUCCESS;
1064     }
1065 
1066     return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1067 }
1068 
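/*
 * Copy 'len' bytes between 'ptr' and an interleaved mapping in 'sg',
 * transferring 'bytes' bytes at a time and then skipping 'skip_bytes',
 * starting at 'offset'. Callers use this to address only the data or only
 * the metadata portion of an extended LBA mapping.
 */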
1069 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1070                                     uint32_t len, uint32_t bytes,
1071                                     int32_t skip_bytes, int64_t offset,
1072                                     NvmeTxDirection dir)
1073 {
1074     hwaddr addr;
1075     uint32_t trans_len, count = bytes;
1076     bool dma = sg->flags & NVME_SG_DMA;
1077     int64_t sge_len;
1078     int sg_idx = 0;
1079     int ret;
1080 
1081     assert(sg->flags & NVME_SG_ALLOC);
1082 
1083     while (len) {
1084         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1085 
1086         if (sge_len - offset < 0) {
1087             offset -= sge_len;
1088             sg_idx++;
1089             continue;
1090         }
1091 
1092         if (sge_len == offset) {
1093             offset = 0;
1094             sg_idx++;
1095             continue;
1096         }
1097 
1098         trans_len = MIN(len, count);
1099         trans_len = MIN(trans_len, sge_len - offset);
1100 
1101         if (dma) {
1102             addr = sg->qsg.sg[sg_idx].base + offset;
1103         } else {
1104             addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1105         }
1106 
1107         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1108             ret = nvme_addr_read(n, addr, ptr, trans_len);
1109         } else {
1110             ret = nvme_addr_write(n, addr, ptr, trans_len);
1111         }
1112 
1113         if (ret) {
1114             return NVME_DATA_TRAS_ERROR;
1115         }
1116 
1117         ptr += trans_len;
1118         len -= trans_len;
1119         count -= trans_len;
1120         offset += trans_len;
1121 
1122         if (count == 0) {
1123             count = bytes;
1124             offset += skip_bytes;
1125         }
1126     }
1127 
1128     return NVME_SUCCESS;
1129 }
1130 
1131 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1132                         NvmeTxDirection dir)
1133 {
1134     assert(sg->flags & NVME_SG_ALLOC);
1135 
1136     if (sg->flags & NVME_SG_DMA) {
1137         uint64_t residual;
1138 
1139         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1140             residual = dma_buf_write(ptr, len, &sg->qsg);
1141         } else {
1142             residual = dma_buf_read(ptr, len, &sg->qsg);
1143         }
1144 
1145         if (unlikely(residual)) {
1146             trace_pci_nvme_err_invalid_dma();
1147             return NVME_INVALID_FIELD | NVME_DNR;
1148         }
1149     } else {
1150         size_t bytes;
1151 
1152         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1153             bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1154         } else {
1155             bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1156         }
1157 
1158         if (unlikely(bytes != len)) {
1159             trace_pci_nvme_err_invalid_dma();
1160             return NVME_INVALID_FIELD | NVME_DNR;
1161         }
1162     }
1163 
1164     return NVME_SUCCESS;
1165 }
1166 
1167 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1168                                 NvmeRequest *req)
1169 {
1170     uint16_t status;
1171 
1172     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1173     if (status) {
1174         return status;
1175     }
1176 
1177     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1178 }
1179 
1180 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1181                                 NvmeRequest *req)
1182 {
1183     uint16_t status;
1184 
1185     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1186     if (status) {
1187         return status;
1188     }
1189 
1190     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1191 }
1192 
1193 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1194                           NvmeTxDirection dir, NvmeRequest *req)
1195 {
1196     NvmeNamespace *ns = req->ns;
1197     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1198     uint16_t ctrl = le16_to_cpu(rw->control);
1199 
1200     if (nvme_ns_ext(ns) &&
1201         !(ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) {
1202         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1203                                    ns->lbaf.ms, 0, dir);
1204     }
1205 
1206     return nvme_tx(n, &req->sg, ptr, len, dir);
1207 }
1208 
1209 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1210                            NvmeTxDirection dir, NvmeRequest *req)
1211 {
1212     NvmeNamespace *ns = req->ns;
1213     uint16_t status;
1214 
1215     if (nvme_ns_ext(ns)) {
1216         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1217                                    ns->lbasz, ns->lbasz, dir);
1218     }
1219 
1220     nvme_sg_unmap(&req->sg);
1221 
1222     status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1223     if (status) {
1224         return status;
1225     }
1226 
1227     return nvme_tx(n, &req->sg, ptr, len, dir);
1228 }
1229 
1230 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1231                                  BlockCompletionFunc *cb, NvmeRequest *req)
1232 {
1233     assert(req->sg.flags & NVME_SG_ALLOC);
1234 
1235     if (req->sg.flags & NVME_SG_DMA) {
1236         req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1237                                   cb, req);
1238     } else {
1239         req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1240     }
1241 }
1242 
1243 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1244                                   BlockCompletionFunc *cb, NvmeRequest *req)
1245 {
1246     assert(req->sg.flags & NVME_SG_ALLOC);
1247 
1248     if (req->sg.flags & NVME_SG_DMA) {
1249         req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1250                                    cb, req);
1251     } else {
1252         req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1253     }
1254 }
1255 
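/*
 * Completion queue entries are posted with the current phase tag in bit 0 of
 * the status field (see the cpu_to_le16((req->status << 1) | cq->phase)
 * below); the phase flips whenever the tail wraps, which is how the host
 * detects newly posted entries.
 */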
1256 static void nvme_post_cqes(void *opaque)
1257 {
1258     NvmeCQueue *cq = opaque;
1259     NvmeCtrl *n = cq->ctrl;
1260     NvmeRequest *req, *next;
1261     int ret;
1262 
1263     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1264         NvmeSQueue *sq;
1265         hwaddr addr;
1266 
1267         if (nvme_cq_full(cq)) {
1268             break;
1269         }
1270 
1271         sq = req->sq;
1272         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1273         req->cqe.sq_id = cpu_to_le16(sq->sqid);
1274         req->cqe.sq_head = cpu_to_le16(sq->head);
1275         addr = cq->dma_addr + cq->tail * n->cqe_size;
1276         ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1277                             sizeof(req->cqe));
1278         if (ret) {
1279             trace_pci_nvme_err_addr_write(addr);
1280             trace_pci_nvme_err_cfs();
1281             n->bar.csts = NVME_CSTS_FAILED;
1282             break;
1283         }
1284         QTAILQ_REMOVE(&cq->req_list, req, entry);
1285         nvme_inc_cq_tail(cq);
1286         nvme_sg_unmap(&req->sg);
1287         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1288     }
1289     if (cq->tail != cq->head) {
1290         nvme_irq_assert(n, cq);
1291     }
1292 }
1293 
1294 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1295 {
1296     assert(cq->cqid == req->sq->cqid);
1297     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1298                                           req->status);
1299 
1300     if (req->status) {
1301         trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1302                                       req->status, req->cmd.opcode);
1303     }
1304 
1305     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1306     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1307     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1308 }
1309 
1310 static void nvme_process_aers(void *opaque)
1311 {
1312     NvmeCtrl *n = opaque;
1313     NvmeAsyncEvent *event, *next;
1314 
1315     trace_pci_nvme_process_aers(n->aer_queued);
1316 
1317     QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1318         NvmeRequest *req;
1319         NvmeAerResult *result;
1320 
1321         /* can't post cqe if there is nothing to complete */
1322         if (!n->outstanding_aers) {
1323             trace_pci_nvme_no_outstanding_aers();
1324             break;
1325         }
1326 
1327         /* ignore if masked (cqe posted, but event not cleared) */
1328         if (n->aer_mask & (1 << event->result.event_type)) {
1329             trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1330             continue;
1331         }
1332 
1333         QTAILQ_REMOVE(&n->aer_queue, event, entry);
1334         n->aer_queued--;
1335 
1336         n->aer_mask |= 1 << event->result.event_type;
1337         n->outstanding_aers--;
1338 
1339         req = n->aer_reqs[n->outstanding_aers];
1340 
1341         result = (NvmeAerResult *) &req->cqe.result;
1342         result->event_type = event->result.event_type;
1343         result->event_info = event->result.event_info;
1344         result->log_page = event->result.log_page;
1345         g_free(event);
1346 
1347         trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1348                                     result->log_page);
1349 
1350         nvme_enqueue_req_completion(&n->admin_cq, req);
1351     }
1352 }
1353 
1354 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1355                                uint8_t event_info, uint8_t log_page)
1356 {
1357     NvmeAsyncEvent *event;
1358 
1359     trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1360 
1361     if (n->aer_queued == n->params.aer_max_queued) {
1362         trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1363         return;
1364     }
1365 
1366     event = g_new(NvmeAsyncEvent, 1);
1367     event->result = (NvmeAerResult) {
1368         .event_type = event_type,
1369         .event_info = event_info,
1370         .log_page   = log_page,
1371     };
1372 
1373     QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1374     n->aer_queued++;
1375 
1376     nvme_process_aers(n);
1377 }
1378 
1379 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1380 {
1381     uint8_t aer_info;
1382 
1383     /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1384     if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1385         return;
1386     }
1387 
1388     switch (event) {
1389     case NVME_SMART_SPARE:
1390         aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1391         break;
1392     case NVME_SMART_TEMPERATURE:
1393         aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1394         break;
1395     case NVME_SMART_RELIABILITY:
1396     case NVME_SMART_MEDIA_READ_ONLY:
1397     case NVME_SMART_FAILED_VOLATILE_MEDIA:
1398     case NVME_SMART_PMR_UNRELIABLE:
1399         aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1400         break;
1401     default:
1402         return;
1403     }
1404 
1405     nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1406 }
1407 
1408 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1409 {
1410     n->aer_mask &= ~(1 << event_type);
1411     if (!QTAILQ_EMPTY(&n->aer_queue)) {
1412         nvme_process_aers(n);
1413     }
1414 }
1415 
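/*
 * For example (illustrative numbers), with a 4 KiB page size and the default
 * mdts=7, the limit enforced here is 4 KiB << 7 = 512 KiB.
 */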
1416 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1417 {
1418     uint8_t mdts = n->params.mdts;
1419 
1420     if (mdts && len > n->page_size << mdts) {
1421         trace_pci_nvme_err_mdts(len);
1422         return NVME_INVALID_FIELD | NVME_DNR;
1423     }
1424 
1425     return NVME_SUCCESS;
1426 }
1427 
1428 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1429                                          uint32_t nlb)
1430 {
1431     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1432 
1433     if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1434         trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1435         return NVME_LBA_RANGE | NVME_DNR;
1436     }
1437 
1438     return NVME_SUCCESS;
1439 }
1440 
1441 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1442                                  uint32_t nlb)
1443 {
1444     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1445 
1446     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1447     int64_t offset = nvme_l2b(ns, slba);
1448     bool zeroed;
1449     int ret;
1450 
1451     Error *local_err = NULL;
1452 
1453     /*
1454      * `pnum` holds the number of bytes after offset that share the same
1455      * allocation status as the byte at offset. If `pnum` is different from
1456      * `bytes`, we should check the allocation status of the next range and
1457      * continue this until all bytes have been checked.
1458      */
1459     do {
1460         bytes -= pnum;
1461 
1462         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1463         if (ret < 0) {
1464             error_setg_errno(&local_err, -ret, "unable to get block status");
1465             error_report_err(local_err);
1466 
1467             return NVME_INTERNAL_DEV_ERROR;
1468         }
1469 
1470         zeroed = !!(ret & BDRV_BLOCK_ZERO);
1471 
1472         trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
1473 
1474         if (zeroed) {
1475             return NVME_DULB;
1476         }
1477 
1478         offset += pnum;
1479     } while (pnum != bytes);
1480 
1481     return NVME_SUCCESS;
1482 }
1483 
1484 static void nvme_aio_err(NvmeRequest *req, int ret)
1485 {
1486     uint16_t status = NVME_SUCCESS;
1487     Error *local_err = NULL;
1488 
1489     switch (req->cmd.opcode) {
1490     case NVME_CMD_READ:
1491         status = NVME_UNRECOVERED_READ;
1492         break;
1493     case NVME_CMD_FLUSH:
1494     case NVME_CMD_WRITE:
1495     case NVME_CMD_WRITE_ZEROES:
1496     case NVME_CMD_ZONE_APPEND:
1497         status = NVME_WRITE_FAULT;
1498         break;
1499     default:
1500         status = NVME_INTERNAL_DEV_ERROR;
1501         break;
1502     }
1503 
1504     trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1505 
1506     error_setg_errno(&local_err, -ret, "aio failed");
1507     error_report_err(local_err);
1508 
1509     /*
1510      * Set the command status code to the first encountered error but allow a
1511      * subsequent Internal Device Error to trump it.
1512      */
1513     if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1514         return;
1515     }
1516 
1517     req->status = status;
1518 }
1519 
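/*
 * Example (illustrative): with a zone size of 2^18 logical blocks (128 MiB at
 * 512 byte blocks), zone_size_log2 is 18 and an slba of 600000 falls in zone
 * index 600000 >> 18 = 2.
 */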
1520 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1521 {
1522     return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1523                                     slba / ns->zone_size;
1524 }
1525 
1526 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1527 {
1528     uint32_t zone_idx = nvme_zone_idx(ns, slba);
1529 
1530     assert(zone_idx < ns->num_zones);
1531     return &ns->zone_array[zone_idx];
1532 }
1533 
1534 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1535 {
1536     uint64_t zslba = zone->d.zslba;
1537 
1538     switch (nvme_get_zone_state(zone)) {
1539     case NVME_ZONE_STATE_EMPTY:
1540     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1541     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1542     case NVME_ZONE_STATE_CLOSED:
1543         return NVME_SUCCESS;
1544     case NVME_ZONE_STATE_FULL:
1545         trace_pci_nvme_err_zone_is_full(zslba);
1546         return NVME_ZONE_FULL;
1547     case NVME_ZONE_STATE_OFFLINE:
1548         trace_pci_nvme_err_zone_is_offline(zslba);
1549         return NVME_ZONE_OFFLINE;
1550     case NVME_ZONE_STATE_READ_ONLY:
1551         trace_pci_nvme_err_zone_is_read_only(zslba);
1552         return NVME_ZONE_READ_ONLY;
1553     default:
1554         assert(false);
1555     }
1556 
1557     return NVME_INTERNAL_DEV_ERROR;
1558 }
1559 
1560 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1561                                       uint64_t slba, uint32_t nlb)
1562 {
1563     uint64_t zcap = nvme_zone_wr_boundary(zone);
1564     uint16_t status;
1565 
1566     status = nvme_check_zone_state_for_write(zone);
1567     if (status) {
1568         return status;
1569     }
1570 
1571     if (unlikely(slba != zone->w_ptr)) {
1572         trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1573         return NVME_ZONE_INVALID_WRITE;
1574     }
1575 
1576     if (unlikely((slba + nlb) > zcap)) {
1577         trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1578         return NVME_ZONE_BOUNDARY_ERROR;
1579     }
1580 
1581     return NVME_SUCCESS;
1582 }
1583 
1584 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1585 {
1586     switch (nvme_get_zone_state(zone)) {
1587     case NVME_ZONE_STATE_EMPTY:
1588     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1589     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1590     case NVME_ZONE_STATE_FULL:
1591     case NVME_ZONE_STATE_CLOSED:
1592     case NVME_ZONE_STATE_READ_ONLY:
1593         return NVME_SUCCESS;
1594     case NVME_ZONE_STATE_OFFLINE:
1595         trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1596         return NVME_ZONE_OFFLINE;
1597     default:
1598         assert(false);
1599     }
1600 
1601     return NVME_INTERNAL_DEV_ERROR;
1602 }
1603 
1604 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1605                                      uint32_t nlb)
1606 {
1607     NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
1608     uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
1609     uint64_t end = slba + nlb;
1610     uint16_t status;
1611 
1612     status = nvme_check_zone_state_for_read(zone);
1613     if (status) {
1614         ;
1615     } else if (unlikely(end > bndry)) {
1616         if (!ns->params.cross_zone_read) {
1617             status = NVME_ZONE_BOUNDARY_ERROR;
1618         } else {
1619             /*
1620              * Read across zone boundary - check that all subsequent
1621              * zones that are being read have an appropriate state.
1622              */
1623             do {
1624                 zone++;
1625                 status = nvme_check_zone_state_for_read(zone);
1626                 if (status) {
1627                     break;
1628                 }
1629             } while (end > nvme_zone_rd_boundary(ns, zone));
1630         }
1631     }
1632 
1633     return status;
1634 }
1635 
1636 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1637 {
1638     switch (nvme_get_zone_state(zone)) {
1639     case NVME_ZONE_STATE_FULL:
1640         return NVME_SUCCESS;
1641 
1642     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1643     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1644         nvme_aor_dec_open(ns);
1645         /* fallthrough */
1646     case NVME_ZONE_STATE_CLOSED:
1647         nvme_aor_dec_active(ns);
1648         /* fallthrough */
1649     case NVME_ZONE_STATE_EMPTY:
1650         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1651         return NVME_SUCCESS;
1652 
1653     default:
1654         return NVME_ZONE_INVAL_TRANSITION;
1655     }
1656 }
1657 
1658 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1659 {
1660     switch (nvme_get_zone_state(zone)) {
1661     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1662     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1663         nvme_aor_dec_open(ns);
1664         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1665         /* fall through */
1666     case NVME_ZONE_STATE_CLOSED:
1667         return NVME_SUCCESS;
1668 
1669     default:
1670         return NVME_ZONE_INVAL_TRANSITION;
1671     }
1672 }
1673 
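/*
 * If the namespace is at its open zone limit, implicitly close the oldest
 * implicitly open zone to make room for another zone to be opened.
 */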
1674 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1675 {
1676     NvmeZone *zone;
1677 
1678     if (ns->params.max_open_zones &&
1679         ns->nr_open_zones == ns->params.max_open_zones) {
1680         zone = QTAILQ_FIRST(&ns->imp_open_zones);
1681         if (zone) {
1682             /*
1683              * Automatically close this implicitly open zone.
1684              */
1685             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1686             nvme_zrm_close(ns, zone);
1687         }
1688     }
1689 }
1690 
1691 enum {
1692     NVME_ZRM_AUTO = 1 << 0,
1693 };
1694 
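/*
 * Zone Resource Management: transition a zone towards an opened state while
 * enforcing the per-namespace active and open resource limits.
 *
 * With NVME_ZRM_AUTO the zone ends up Implicitly Opened (the path taken by
 * nvme_zrm_auto() on writes); without it the zone is Explicitly Opened
 * (nvme_zrm_open(), used by the Zone Management Send Open Zone action).
 */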
1695 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1696                                     NvmeZone *zone, int flags)
1697 {
1698     int act = 0;
1699     uint16_t status;
1700 
1701     switch (nvme_get_zone_state(zone)) {
1702     case NVME_ZONE_STATE_EMPTY:
1703         act = 1;
1704 
1705         /* fallthrough */
1706 
1707     case NVME_ZONE_STATE_CLOSED:
1708         if (n->params.auto_transition_zones) {
1709             nvme_zrm_auto_transition_zone(ns);
1710         }
1711         status = nvme_aor_check(ns, act, 1);
1712         if (status) {
1713             return status;
1714         }
1715 
1716         if (act) {
1717             nvme_aor_inc_active(ns);
1718         }
1719 
1720         nvme_aor_inc_open(ns);
1721 
1722         if (flags & NVME_ZRM_AUTO) {
1723             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1724             return NVME_SUCCESS;
1725         }
1726 
1727         /* fallthrough */
1728 
1729     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1730         if (flags & NVME_ZRM_AUTO) {
1731             return NVME_SUCCESS;
1732         }
1733 
1734         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1735 
1736         /* fallthrough */
1737 
1738     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1739         return NVME_SUCCESS;
1740 
1741     default:
1742         return NVME_ZONE_INVAL_TRANSITION;
1743     }
1744 }
1745 
1746 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1747                                      NvmeZone *zone)
1748 {
1749     return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1750 }
1751 
1752 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
1753                                      NvmeZone *zone)
1754 {
1755     return nvme_zrm_open_flags(n, ns, zone, 0);
1756 }
1757 
1758 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1759                                  uint32_t nlb)
1760 {
1761     zone->d.wp += nlb;
1762 
1763     if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1764         nvme_zrm_finish(ns, zone);
1765     }
1766 }
1767 
1768 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1769 {
1770     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1771     NvmeZone *zone;
1772     uint64_t slba;
1773     uint32_t nlb;
1774 
1775     slba = le64_to_cpu(rw->slba);
1776     nlb = le16_to_cpu(rw->nlb) + 1;
1777     zone = nvme_get_zone_by_slba(ns, slba);
1778 
1779     nvme_advance_zone_wp(ns, zone, nlb);
1780 }
1781 
1782 static inline bool nvme_is_write(NvmeRequest *req)
1783 {
1784     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1785 
1786     return rw->opcode == NVME_CMD_WRITE ||
1787            rw->opcode == NVME_CMD_ZONE_APPEND ||
1788            rw->opcode == NVME_CMD_WRITE_ZEROES;
1789 }
1790 
1791 static void nvme_misc_cb(void *opaque, int ret)
1792 {
1793     NvmeRequest *req = opaque;
1794     NvmeNamespace *ns = req->ns;
1795 
1796     BlockBackend *blk = ns->blkconf.blk;
1797     BlockAcctCookie *acct = &req->acct;
1798     BlockAcctStats *stats = blk_get_stats(blk);
1799 
1800     trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));
1801 
1802     if (ret) {
1803         block_acct_failed(stats, acct);
1804         nvme_aio_err(req, ret);
1805     } else {
1806         block_acct_done(stats, acct);
1807     }
1808 
1809     nvme_enqueue_req_completion(nvme_cq(req), req);
1810 }
1811 
1812 void nvme_rw_complete_cb(void *opaque, int ret)
1813 {
1814     NvmeRequest *req = opaque;
1815     NvmeNamespace *ns = req->ns;
1816     BlockBackend *blk = ns->blkconf.blk;
1817     BlockAcctCookie *acct = &req->acct;
1818     BlockAcctStats *stats = blk_get_stats(blk);
1819 
1820     trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1821 
1822     if (ret) {
1823         block_acct_failed(stats, acct);
1824         nvme_aio_err(req, ret);
1825     } else {
1826         block_acct_done(stats, acct);
1827     }
1828 
1829     if (ns->params.zoned && nvme_is_write(req)) {
1830         nvme_finalize_zoned_write(ns, req);
1831     }
1832 
1833     nvme_enqueue_req_completion(nvme_cq(req), req);
1834 }
1835 
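/*
 * Completion for plain reads and writes. If the namespace has metadata, a
 * second I/O is chained for the metadata region (zeroing it for Write
 * Zeroes, or transferring it when the LBA format is extended or MPTR is
 * set), with nvme_rw_complete_cb() as the final completion.
 */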
1836 static void nvme_rw_cb(void *opaque, int ret)
1837 {
1838     NvmeRequest *req = opaque;
1839     NvmeNamespace *ns = req->ns;
1840 
1841     BlockBackend *blk = ns->blkconf.blk;
1842 
1843     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1844 
1845     if (ret) {
1846         goto out;
1847     }
1848 
1849     if (ns->lbaf.ms) {
1850         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1851         uint64_t slba = le64_to_cpu(rw->slba);
1852         uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1853         uint64_t offset = nvme_moff(ns, slba);
1854 
1855         if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1856             size_t mlen = nvme_m2b(ns, nlb);
1857 
1858             req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1859                                                BDRV_REQ_MAY_UNMAP,
1860                                                nvme_rw_complete_cb, req);
1861             return;
1862         }
1863 
1864         if (nvme_ns_ext(ns) || req->cmd.mptr) {
1865             uint16_t status;
1866 
1867             nvme_sg_unmap(&req->sg);
1868             status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1869             if (status) {
1870                 ret = -EFAULT;
1871                 goto out;
1872             }
1873 
1874             if (req->cmd.opcode == NVME_CMD_READ) {
1875                 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1876             }
1877 
1878             return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1879         }
1880     }
1881 
1882 out:
1883     nvme_rw_complete_cb(req, ret);
1884 }
1885 
1886 struct nvme_aio_format_ctx {
1887     NvmeRequest   *req;
1888     NvmeNamespace *ns;
1889 
1890     /* number of outstanding write zeroes for this namespace */
1891     int *count;
1892 };
1893 
1894 static void nvme_aio_format_cb(void *opaque, int ret)
1895 {
1896     struct nvme_aio_format_ctx *ctx = opaque;
1897     NvmeRequest *req = ctx->req;
1898     NvmeNamespace *ns = ctx->ns;
1899     uintptr_t *num_formats = (uintptr_t *)&req->opaque;
1900     int *count = ctx->count;
1901 
1902     g_free(ctx);
1903 
1904     if (ret) {
1905         nvme_aio_err(req, ret);
1906     }
1907 
1908     if (--(*count)) {
1909         return;
1910     }
1911 
1912     g_free(count);
1913     ns->status = 0x0;
1914 
1915     if (--(*num_formats)) {
1916         return;
1917     }
1918 
1919     nvme_enqueue_req_completion(nvme_cq(req), req);
1920 }
1921 
1922 struct nvme_aio_flush_ctx {
1923     NvmeRequest     *req;
1924     NvmeNamespace   *ns;
1925     BlockAcctCookie acct;
1926 };
1927 
1928 static void nvme_aio_flush_cb(void *opaque, int ret)
1929 {
1930     struct nvme_aio_flush_ctx *ctx = opaque;
1931     NvmeRequest *req = ctx->req;
1932     uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
1933 
1934     BlockBackend *blk = ctx->ns->blkconf.blk;
1935     BlockAcctCookie *acct = &ctx->acct;
1936     BlockAcctStats *stats = blk_get_stats(blk);
1937 
1938     trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
1939 
1940     if (!ret) {
1941         block_acct_done(stats, acct);
1942     } else {
1943         block_acct_failed(stats, acct);
1944         nvme_aio_err(req, ret);
1945     }
1946 
1947     (*num_flushes)--;
1948     g_free(ctx);
1949 
1950     if (*num_flushes) {
1951         return;
1952     }
1953 
1954     nvme_enqueue_req_completion(nvme_cq(req), req);
1955 }
1956 
1957 static void nvme_verify_cb(void *opaque, int ret)
1958 {
1959     NvmeBounceContext *ctx = opaque;
1960     NvmeRequest *req = ctx->req;
1961     NvmeNamespace *ns = req->ns;
1962     BlockBackend *blk = ns->blkconf.blk;
1963     BlockAcctCookie *acct = &req->acct;
1964     BlockAcctStats *stats = blk_get_stats(blk);
1965     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1966     uint64_t slba = le64_to_cpu(rw->slba);
1967     uint16_t ctrl = le16_to_cpu(rw->control);
1968     uint16_t apptag = le16_to_cpu(rw->apptag);
1969     uint16_t appmask = le16_to_cpu(rw->appmask);
1970     uint32_t reftag = le32_to_cpu(rw->reftag);
1971     uint16_t status;
1972 
1973     trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
1974                              appmask, reftag);
1975 
1976     if (ret) {
1977         block_acct_failed(stats, acct);
1978         nvme_aio_err(req, ret);
1979         goto out;
1980     }
1981 
1982     block_acct_done(stats, acct);
1983 
1984     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1985         status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1986                                        ctx->mdata.iov.size, slba);
1987         if (status) {
1988             req->status = status;
1989             goto out;
1990         }
1991 
1992         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1993                                      ctx->mdata.bounce, ctx->mdata.iov.size,
1994                                      ctrl, slba, apptag, appmask, reftag);
1995     }
1996 
1997 out:
1998     qemu_iovec_destroy(&ctx->data.iov);
1999     g_free(ctx->data.bounce);
2000 
2001     qemu_iovec_destroy(&ctx->mdata.iov);
2002     g_free(ctx->mdata.bounce);
2003 
2004     g_free(ctx);
2005 
2006     nvme_enqueue_req_completion(nvme_cq(req), req);
2007 }
2008 
2009 
2010 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2011 {
2012     NvmeBounceContext *ctx = opaque;
2013     NvmeRequest *req = ctx->req;
2014     NvmeNamespace *ns = req->ns;
2015     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2016     uint64_t slba = le64_to_cpu(rw->slba);
2017     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2018     size_t mlen = nvme_m2b(ns, nlb);
2019     uint64_t offset = nvme_moff(ns, slba);
2020     BlockBackend *blk = ns->blkconf.blk;
2021 
2022     trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2023 
2024     if (ret) {
2025         goto out;
2026     }
2027 
2028     ctx->mdata.bounce = g_malloc(mlen);
2029 
2030     qemu_iovec_reset(&ctx->mdata.iov);
2031     qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2032 
2033     req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2034                                 nvme_verify_cb, ctx);
2035     return;
2036 
2037 out:
2038     nvme_verify_cb(ctx, ret);
2039 }
2040 
2041 static void nvme_aio_discard_cb(void *opaque, int ret)
2042 {
2043     NvmeRequest *req = opaque;
2044     uintptr_t *discards = (uintptr_t *)&req->opaque;
2045 
2046     trace_pci_nvme_aio_discard_cb(nvme_cid(req));
2047 
2048     if (ret) {
2049         nvme_aio_err(req, ret);
2050     }
2051 
2052     (*discards)--;
2053 
2054     if (*discards) {
2055         return;
2056     }
2057 
2058     nvme_enqueue_req_completion(nvme_cq(req), req);
2059 }
2060 
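/*
 * Zone Reset is a chain of AIOs per zone: the zone data is zeroed first,
 * then (in nvme_aio_zone_reset_cb) the metadata if present, and finally
 * nvme_aio_zone_reset_complete_cb rewinds the write pointer and moves the
 * zone to the Empty state. The context carries the zone being reset so the
 * completion can transition it.
 */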
2061 struct nvme_zone_reset_ctx {
2062     NvmeRequest *req;
2063     NvmeZone    *zone;
2064 };
2065 
2066 static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
2067 {
2068     struct nvme_zone_reset_ctx *ctx = opaque;
2069     NvmeRequest *req = ctx->req;
2070     NvmeNamespace *ns = req->ns;
2071     NvmeZone *zone = ctx->zone;
2072     uintptr_t *resets = (uintptr_t *)&req->opaque;
2073 
2074     if (ret) {
2075         nvme_aio_err(req, ret);
2076         goto out;
2077     }
2078 
2079     switch (nvme_get_zone_state(zone)) {
2080     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2081     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2082         nvme_aor_dec_open(ns);
2083         /* fall through */
2084     case NVME_ZONE_STATE_CLOSED:
2085         nvme_aor_dec_active(ns);
2086         /* fall through */
2087     case NVME_ZONE_STATE_FULL:
2088         zone->w_ptr = zone->d.zslba;
2089         zone->d.wp = zone->w_ptr;
2090         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
2091         /* fall through */
2092     default:
2093         break;
2094     }
2095 
2096 out:
2097     g_free(ctx);
2098 
2099     (*resets)--;
2100 
2101     if (*resets) {
2102         return;
2103     }
2104 
2105     nvme_enqueue_req_completion(nvme_cq(req), req);
2106 }
2107 
2108 static void nvme_aio_zone_reset_cb(void *opaque, int ret)
2109 {
2110     struct nvme_zone_reset_ctx *ctx = opaque;
2111     NvmeRequest *req = ctx->req;
2112     NvmeNamespace *ns = req->ns;
2113     NvmeZone *zone = ctx->zone;
2114 
2115     trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
2116 
2117     if (ret) {
2118         goto out;
2119     }
2120 
2121     if (ns->lbaf.ms) {
2122         int64_t offset = nvme_moff(ns, zone->d.zslba);
2123 
2124         blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
2125                               nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
2126                               nvme_aio_zone_reset_complete_cb, ctx);
2127         return;
2128     }
2129 
2130 out:
2131     nvme_aio_zone_reset_complete_cb(opaque, ret);
2132 }
2133 
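/*
 * Simple Copy is implemented with bounce buffers: all source ranges are
 * read into ctx->bounce (and ctx->mbounce for metadata), verified in
 * nvme_copy_in_complete() and then written out contiguously at the
 * destination SDLBA.
 */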
2134 struct nvme_copy_ctx {
2135     int copies;
2136     uint8_t *bounce;
2137     uint8_t *mbounce;
2138     uint32_t nlb;
2139     NvmeCopySourceRange *ranges;
2140 };
2141 
2142 struct nvme_copy_in_ctx {
2143     NvmeRequest *req;
2144     QEMUIOVector iov;
2145     NvmeCopySourceRange *range;
2146 };
2147 
2148 static void nvme_copy_complete_cb(void *opaque, int ret)
2149 {
2150     NvmeRequest *req = opaque;
2151     NvmeNamespace *ns = req->ns;
2152     struct nvme_copy_ctx *ctx = req->opaque;
2153 
2154     if (ret) {
2155         block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2156         nvme_aio_err(req, ret);
2157         goto out;
2158     }
2159 
2160     block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2161 
2162 out:
2163     if (ns->params.zoned) {
2164         NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2165         uint64_t sdlba = le64_to_cpu(copy->sdlba);
2166         NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2167 
2168         nvme_advance_zone_wp(ns, zone, ctx->nlb);
2169     }
2170 
2171     g_free(ctx->bounce);
2172     g_free(ctx->mbounce);
2173     g_free(ctx);
2174 
2175     nvme_enqueue_req_completion(nvme_cq(req), req);
2176 }
2177 
2178 static void nvme_copy_cb(void *opaque, int ret)
2179 {
2180     NvmeRequest *req = opaque;
2181     NvmeNamespace *ns = req->ns;
2182     struct nvme_copy_ctx *ctx = req->opaque;
2183 
2184     trace_pci_nvme_copy_cb(nvme_cid(req));
2185 
2186     if (ret) {
2187         goto out;
2188     }
2189 
2190     if (ns->lbaf.ms) {
2191         NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2192         uint64_t sdlba = le64_to_cpu(copy->sdlba);
2193         int64_t offset = nvme_moff(ns, sdlba);
2194 
2195         qemu_iovec_reset(&req->sg.iov);
2196         qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));
2197 
2198         req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
2199                                      nvme_copy_complete_cb, req);
2200         return;
2201     }
2202 
2203 out:
2204     nvme_copy_complete_cb(opaque, ret);
2205 }
2206 
2207 static void nvme_copy_in_complete(NvmeRequest *req)
2208 {
2209     NvmeNamespace *ns = req->ns;
2210     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2211     struct nvme_copy_ctx *ctx = req->opaque;
2212     uint64_t sdlba = le64_to_cpu(copy->sdlba);
2213     uint16_t status;
2214 
2215     trace_pci_nvme_copy_in_complete(nvme_cid(req));
2216 
2217     block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2218 
2219     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2220         uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
2221         uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
2222         uint16_t nr = copy->nr + 1;
2223         NvmeCopySourceRange *range;
2224         uint64_t slba;
2225         uint32_t nlb;
2226         uint16_t apptag, appmask;
2227         uint32_t reftag;
2228         uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
2229         size_t len, mlen;
2230         int i;
2231 
2232         /*
2233          * The dif helpers expect prinfo to be similar to the control field of
2234          * the NvmeRwCmd, so shift by 10 to fake it.
2235          */
2236         prinfor = prinfor << 10;
2237         prinfow = prinfow << 10;
2238 
2239         for (i = 0; i < nr; i++) {
2240             range = &ctx->ranges[i];
2241             slba = le64_to_cpu(range->slba);
2242             nlb = le16_to_cpu(range->nlb) + 1;
2243             len = nvme_l2b(ns, nlb);
2244             mlen = nvme_m2b(ns, nlb);
2245             apptag = le16_to_cpu(range->apptag);
2246             appmask = le16_to_cpu(range->appmask);
2247             reftag = le32_to_cpu(range->reftag);
2248 
2249             status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
2250                                     apptag, appmask, reftag);
2251             if (status) {
2252                 goto invalid;
2253             }
2254 
2255             buf += len;
2256             mbuf += mlen;
2257         }
2258 
2259         apptag = le16_to_cpu(copy->apptag);
2260         appmask = le16_to_cpu(copy->appmask);
2261         reftag = le32_to_cpu(copy->reftag);
2262 
2263         len = nvme_l2b(ns, ctx->nlb);
2264         mlen = nvme_m2b(ns, ctx->nlb);
2265 
2266         if (prinfow & NVME_RW_PRINFO_PRACT) {
2267             status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
2268             if (status) {
2269                 goto invalid;
2270             }
2271 
2272             nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
2273                                         mlen, apptag, reftag);
2274         } else {
2275             status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
2276                                     prinfow, sdlba, apptag, appmask, reftag);
2277             if (status) {
2278                 goto invalid;
2279             }
2280         }
2281     }
2282 
2283     status = nvme_check_bounds(ns, sdlba, ctx->nlb);
2284     if (status) {
2285         goto invalid;
2286     }
2287 
2288     if (ns->params.zoned) {
2289         NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2290 
2291         status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
2292         if (status) {
2293             goto invalid;
2294         }
2295 
2296         status = nvme_zrm_auto(nvme_ctrl(req), ns, zone);
2297         if (status) {
2298             goto invalid;
2299         }
2300 
2301         zone->w_ptr += ctx->nlb;
2302     }
2303 
2304     qemu_iovec_init(&req->sg.iov, 1);
2305     qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
2306 
2307     block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2308                      BLOCK_ACCT_WRITE);
2309 
2310     req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
2311                                  &req->sg.iov, 0, nvme_copy_cb, req);
2312 
2313     return;
2314 
2315 invalid:
2316     req->status = status;
2317 
2318     g_free(ctx->bounce);
2319     g_free(ctx);
2320 
2321     nvme_enqueue_req_completion(nvme_cq(req), req);
2322 }
2323 
2324 static void nvme_aio_copy_in_cb(void *opaque, int ret)
2325 {
2326     struct nvme_copy_in_ctx *in_ctx = opaque;
2327     NvmeRequest *req = in_ctx->req;
2328     NvmeNamespace *ns = req->ns;
2329     struct nvme_copy_ctx *ctx = req->opaque;
2330 
2331     qemu_iovec_destroy(&in_ctx->iov);
2332     g_free(in_ctx);
2333 
2334     trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
2335 
2336     if (ret) {
2337         nvme_aio_err(req, ret);
2338     }
2339 
2340     ctx->copies--;
2341 
2342     if (ctx->copies) {
2343         return;
2344     }
2345 
2346     if (req->status) {
2347         block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2348 
2349         g_free(ctx->bounce);
2350         g_free(ctx->mbounce);
2351         g_free(ctx);
2352 
2353         nvme_enqueue_req_completion(nvme_cq(req), req);
2354 
2355         return;
2356     }
2357 
2358     nvme_copy_in_complete(req);
2359 }
2360 
2361 struct nvme_compare_ctx {
2362     struct {
2363         QEMUIOVector iov;
2364         uint8_t *bounce;
2365     } data;
2366 
2367     struct {
2368         QEMUIOVector iov;
2369         uint8_t *bounce;
2370     } mdata;
2371 };
2372 
2373 static void nvme_compare_mdata_cb(void *opaque, int ret)
2374 {
2375     NvmeRequest *req = opaque;
2376     NvmeNamespace *ns = req->ns;
2377     NvmeCtrl *n = nvme_ctrl(req);
2378     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2379     uint16_t ctrl = le16_to_cpu(rw->control);
2380     uint16_t apptag = le16_to_cpu(rw->apptag);
2381     uint16_t appmask = le16_to_cpu(rw->appmask);
2382     uint32_t reftag = le32_to_cpu(rw->reftag);
2383     struct nvme_compare_ctx *ctx = req->opaque;
2384     g_autofree uint8_t *buf = NULL;
2385     BlockBackend *blk = ns->blkconf.blk;
2386     BlockAcctCookie *acct = &req->acct;
2387     BlockAcctStats *stats = blk_get_stats(blk);
2388     uint16_t status = NVME_SUCCESS;
2389 
2390     trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2391 
2392     if (ret) {
2393         block_acct_failed(stats, acct);
2394         nvme_aio_err(req, ret);
2395         goto out;
2396     }
2397 
2398     buf = g_malloc(ctx->mdata.iov.size);
2399 
2400     status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2401                                NVME_TX_DIRECTION_TO_DEVICE, req);
2402     if (status) {
2403         req->status = status;
2404         goto out;
2405     }
2406 
2407     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2408         uint64_t slba = le64_to_cpu(rw->slba);
2409         uint8_t *bufp;
2410         uint8_t *mbufp = ctx->mdata.bounce;
2411         uint8_t *end = mbufp + ctx->mdata.iov.size;
2412         int16_t pil = 0;
2413 
2414         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2415                                 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
2416                                 slba, apptag, appmask, reftag);
2417         if (status) {
2418             req->status = status;
2419             goto out;
2420         }
2421 
2422         /*
2423          * When formatted with protection information, do not compare the DIF
2424          * tuple.
2425          */
2426         if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2427             pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2428         }
2429 
2430         for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2431             if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2432                 req->status = NVME_CMP_FAILURE;
2433                 goto out;
2434             }
2435         }
2436 
2437         goto out;
2438     }
2439 
2440     if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2441         req->status = NVME_CMP_FAILURE;
2442         goto out;
2443     }
2444 
2445     block_acct_done(stats, acct);
2446 
2447 out:
2448     qemu_iovec_destroy(&ctx->data.iov);
2449     g_free(ctx->data.bounce);
2450 
2451     qemu_iovec_destroy(&ctx->mdata.iov);
2452     g_free(ctx->mdata.bounce);
2453 
2454     g_free(ctx);
2455 
2456     nvme_enqueue_req_completion(nvme_cq(req), req);
2457 }
2458 
2459 static void nvme_compare_data_cb(void *opaque, int ret)
2460 {
2461     NvmeRequest *req = opaque;
2462     NvmeCtrl *n = nvme_ctrl(req);
2463     NvmeNamespace *ns = req->ns;
2464     BlockBackend *blk = ns->blkconf.blk;
2465     BlockAcctCookie *acct = &req->acct;
2466     BlockAcctStats *stats = blk_get_stats(blk);
2467 
2468     struct nvme_compare_ctx *ctx = req->opaque;
2469     g_autofree uint8_t *buf = NULL;
2470     uint16_t status;
2471 
2472     trace_pci_nvme_compare_data_cb(nvme_cid(req));
2473 
2474     if (ret) {
2475         block_acct_failed(stats, acct);
2476         nvme_aio_err(req, ret);
2477         goto out;
2478     }
2479 
2480     buf = g_malloc(ctx->data.iov.size);
2481 
2482     status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2483                               NVME_TX_DIRECTION_TO_DEVICE, req);
2484     if (status) {
2485         req->status = status;
2486         goto out;
2487     }
2488 
2489     if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2490         req->status = NVME_CMP_FAILURE;
2491         goto out;
2492     }
2493 
2494     if (ns->lbaf.ms) {
2495         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2496         uint64_t slba = le64_to_cpu(rw->slba);
2497         uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2498         size_t mlen = nvme_m2b(ns, nlb);
2499         uint64_t offset = nvme_moff(ns, slba);
2500 
2501         ctx->mdata.bounce = g_malloc(mlen);
2502 
2503         qemu_iovec_init(&ctx->mdata.iov, 1);
2504         qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2505 
2506         req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2507                                     nvme_compare_mdata_cb, req);
2508         return;
2509     }
2510 
2511     block_acct_done(stats, acct);
2512 
2513 out:
2514     qemu_iovec_destroy(&ctx->data.iov);
2515     g_free(ctx->data.bounce);
2516     g_free(ctx);
2517 
2518     nvme_enqueue_req_completion(nvme_cq(req), req);
2519 }
2520 
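/*
 * Dataset Management. Only the Deallocate (AD) attribute is acted upon;
 * each range is discarded in chunks of at most BDRV_REQUEST_MAX_BYTES and
 * the number of outstanding discards is tracked through req->opaque so the
 * request only completes after the last discard has finished.
 */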
2521 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2522 {
2523     NvmeNamespace *ns = req->ns;
2524     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2525 
2526     uint32_t attr = le32_to_cpu(dsm->attributes);
2527     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2528 
2529     uint16_t status = NVME_SUCCESS;
2530 
2531     trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
2532 
2533     if (attr & NVME_DSMGMT_AD) {
2534         int64_t offset;
2535         size_t len;
2536         NvmeDsmRange range[nr];
2537         uintptr_t *discards = (uintptr_t *)&req->opaque;
2538 
2539         status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
2540         if (status) {
2541             return status;
2542         }
2543 
2544         /*
2545          * AIO callbacks may be called immediately, so initialize discards to 1
2546          * to make sure that the callback does not complete the request before
2547          * all discards have been issued.
2548          */
2549         *discards = 1;
2550 
2551         for (int i = 0; i < nr; i++) {
2552             uint64_t slba = le64_to_cpu(range[i].slba);
2553             uint32_t nlb = le32_to_cpu(range[i].nlb);
2554 
2555             if (nvme_check_bounds(ns, slba, nlb)) {
2556                 continue;
2557             }
2558 
2559             trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
2560                                           nlb);
2561 
2562             if (nlb > n->dmrsl) {
2563                 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2564             }
2565 
2566             offset = nvme_l2b(ns, slba);
2567             len = nvme_l2b(ns, nlb);
2568 
2569             while (len) {
2570                 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
2571 
2572                 (*discards)++;
2573 
2574                 blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
2575                                  nvme_aio_discard_cb, req);
2576 
2577                 offset += bytes;
2578                 len -= bytes;
2579             }
2580         }
2581 
2582         /* account for the 1-initialization */
2583         (*discards)--;
2584 
2585         if (*discards) {
2586             status = NVME_NO_COMPLETE;
2587         } else {
2588             status = req->status;
2589         }
2590     }
2591 
2592     return status;
2593 }
2594 
2595 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2596 {
2597     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2598     NvmeNamespace *ns = req->ns;
2599     BlockBackend *blk = ns->blkconf.blk;
2600     uint64_t slba = le64_to_cpu(rw->slba);
2601     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2602     size_t len = nvme_l2b(ns, nlb);
2603     int64_t offset = nvme_l2b(ns, slba);
2604     uint16_t ctrl = le16_to_cpu(rw->control);
2605     uint32_t reftag = le32_to_cpu(rw->reftag);
2606     NvmeBounceContext *ctx = NULL;
2607     uint16_t status;
2608 
2609     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2610 
2611     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2612         status = nvme_check_prinfo(ns, ctrl, slba, reftag);
2613         if (status) {
2614             return status;
2615         }
2616 
2617         if (ctrl & NVME_RW_PRINFO_PRACT) {
2618             return NVME_INVALID_PROT_INFO | NVME_DNR;
2619         }
2620     }
2621 
2622     if (len > n->page_size << n->params.vsl) {
2623         return NVME_INVALID_FIELD | NVME_DNR;
2624     }
2625 
2626     status = nvme_check_bounds(ns, slba, nlb);
2627     if (status) {
2628         return status;
2629     }
2630 
2631     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2632         status = nvme_check_dulbe(ns, slba, nlb);
2633         if (status) {
2634             return status;
2635         }
2636     }
2637 
2638     ctx = g_new0(NvmeBounceContext, 1);
2639     ctx->req = req;
2640 
2641     ctx->data.bounce = g_malloc(len);
2642 
2643     qemu_iovec_init(&ctx->data.iov, 1);
2644     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2645 
2646     block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2647                      BLOCK_ACCT_READ);
2648 
2649     req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2650                                 nvme_verify_mdata_in_cb, ctx);
2651     return NVME_NO_COMPLETE;
2652 }
2653 
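/*
 * Simple Copy: validate the source ranges (bounds, DULBE, zone state) and
 * the copy limits (MSSRL, MCL), then read each range's data and metadata
 * into the bounce buffers; the final read completion hands off to
 * nvme_copy_in_complete() for the write-out.
 */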
2654 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2655 {
2656     NvmeNamespace *ns = req->ns;
2657     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2658 
2659     uint16_t nr = copy->nr + 1;
2660     uint8_t format = copy->control[0] & 0xf;
2661 
2662     /*
2663      * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
2664      * NVME_RW_PRINFO constants.
2665      */
2666     uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
2667     uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;
2668 
2669     uint32_t nlb = 0;
2670     uint8_t *bounce = NULL, *bouncep = NULL;
2671     uint8_t *mbounce = NULL, *mbouncep = NULL;
2672     struct nvme_copy_ctx *ctx;
2673     uint16_t status;
2674     int i;
2675 
2676     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2677 
2678     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2679         ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
2680         return NVME_INVALID_FIELD | NVME_DNR;
2681     }
2682 
2683     if (!(n->id_ctrl.ocfs & (1 << format))) {
2684         trace_pci_nvme_err_copy_invalid_format(format);
2685         return NVME_INVALID_FIELD | NVME_DNR;
2686     }
2687 
2688     if (nr > ns->id_ns.msrc + 1) {
2689         return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2690     }
2691 
2692     ctx = g_new(struct nvme_copy_ctx, 1);
2693     ctx->ranges = g_new(NvmeCopySourceRange, nr);
2694 
2695     status = nvme_h2c(n, (uint8_t *)ctx->ranges,
2696                       nr * sizeof(NvmeCopySourceRange), req);
2697     if (status) {
2698         goto out;
2699     }
2700 
2701     for (i = 0; i < nr; i++) {
2702         uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2703         uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2704 
2705         if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2706             status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2707             goto out;
2708         }
2709 
2710         status = nvme_check_bounds(ns, slba, _nlb);
2711         if (status) {
2712             goto out;
2713         }
2714 
2715         if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2716             status = nvme_check_dulbe(ns, slba, _nlb);
2717             if (status) {
2718                 goto out;
2719             }
2720         }
2721 
2722         if (ns->params.zoned) {
2723             status = nvme_check_zone_read(ns, slba, _nlb);
2724             if (status) {
2725                 goto out;
2726             }
2727         }
2728 
2729         nlb += _nlb;
2730     }
2731 
2732     if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
2733         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2734         goto out;
2735     }
2736 
2737     bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
2738     if (ns->lbaf.ms) {
2739         mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
2740     }
2741 
2742     block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2743                      BLOCK_ACCT_READ);
2744 
2745     ctx->bounce = bounce;
2746     ctx->mbounce = mbounce;
2747     ctx->nlb = nlb;
2748     ctx->copies = 1;
2749 
2750     req->opaque = ctx;
2751 
2752     for (i = 0; i < nr; i++) {
2753         uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2754         uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2755 
2756         size_t len = nvme_l2b(ns, nlb);
2757         int64_t offset = nvme_l2b(ns, slba);
2758 
2759         trace_pci_nvme_copy_source_range(slba, nlb);
2760 
2761         struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2762         in_ctx->req = req;
2763 
2764         qemu_iovec_init(&in_ctx->iov, 1);
2765         qemu_iovec_add(&in_ctx->iov, bouncep, len);
2766 
2767         ctx->copies++;
2768 
2769         blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2770                        nvme_aio_copy_in_cb, in_ctx);
2771 
2772         bouncep += len;
2773 
2774         if (ns->lbaf.ms) {
2775             len = nvme_m2b(ns, nlb);
2776             offset = nvme_moff(ns, slba);
2777 
2778             in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2779             in_ctx->req = req;
2780 
2781             qemu_iovec_init(&in_ctx->iov, 1);
2782             qemu_iovec_add(&in_ctx->iov, mbouncep, len);
2783 
2784             ctx->copies++;
2785 
2786             blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2787                            nvme_aio_copy_in_cb, in_ctx);
2788 
2789             mbouncep += len;
2790         }
2791     }
2792 
2793     /* account for the 1-initialization */
2794     ctx->copies--;
2795 
2796     if (!ctx->copies) {
2797         nvme_copy_in_complete(req);
2798     }
2799 
2800     return NVME_NO_COMPLETE;
2801 
2802 out:
2803     g_free(ctx->ranges);
2804     g_free(ctx);
2805 
2806     return status;
2807 }
2808 
2809 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2810 {
2811     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2812     NvmeNamespace *ns = req->ns;
2813     BlockBackend *blk = ns->blkconf.blk;
2814     uint64_t slba = le64_to_cpu(rw->slba);
2815     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2816     uint16_t ctrl = le16_to_cpu(rw->control);
2817     size_t data_len = nvme_l2b(ns, nlb);
2818     size_t len = data_len;
2819     int64_t offset = nvme_l2b(ns, slba);
2820     struct nvme_compare_ctx *ctx = NULL;
2821     uint16_t status;
2822 
2823     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2824 
2825     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
2826         return NVME_INVALID_PROT_INFO | NVME_DNR;
2827     }
2828 
2829     if (nvme_ns_ext(ns)) {
2830         len += nvme_m2b(ns, nlb);
2831     }
2832 
2833     status = nvme_check_mdts(n, len);
2834     if (status) {
2835         return status;
2836     }
2837 
2838     status = nvme_check_bounds(ns, slba, nlb);
2839     if (status) {
2840         return status;
2841     }
2842 
2843     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2844         status = nvme_check_dulbe(ns, slba, nlb);
2845         if (status) {
2846             return status;
2847         }
2848     }
2849 
2850     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2851     if (status) {
2852         return status;
2853     }
2854 
2855     ctx = g_new(struct nvme_compare_ctx, 1);
2856     ctx->data.bounce = g_malloc(data_len);
2857 
2858     req->opaque = ctx;
2859 
2860     qemu_iovec_init(&ctx->data.iov, 1);
2861     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2862 
2863     block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2864                      BLOCK_ACCT_READ);
2865     req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2866                                 nvme_compare_data_cb, req);
2867 
2868     return NVME_NO_COMPLETE;
2869 }
2870 
2871 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
2872 {
2873     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2874     uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
2875     uint16_t status;
2876     struct nvme_aio_flush_ctx *ctx;
2877     NvmeNamespace *ns;
2878 
2879     trace_pci_nvme_flush(nvme_cid(req), nsid);
2880 
2881     if (nsid != NVME_NSID_BROADCAST) {
2882         req->ns = nvme_ns(n, nsid);
2883         if (unlikely(!req->ns)) {
2884             return NVME_INVALID_FIELD | NVME_DNR;
2885         }
2886 
2887         block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
2888                          BLOCK_ACCT_FLUSH);
2889         req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
2890         return NVME_NO_COMPLETE;
2891     }
2892 
2893     /* 1-initialize; see comment in nvme_dsm */
2894     *num_flushes = 1;
2895 
2896     for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
2897         ns = nvme_ns(n, i);
2898         if (!ns) {
2899             continue;
2900         }
2901 
2902         ctx = g_new(struct nvme_aio_flush_ctx, 1);
2903         ctx->req = req;
2904         ctx->ns = ns;
2905 
2906         (*num_flushes)++;
2907 
2908         block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
2909                          BLOCK_ACCT_FLUSH);
2910         blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
2911     }
2912 
2913     /* account for the 1-initialization */
2914     (*num_flushes)--;
2915 
2916     if (*num_flushes) {
2917         status = NVME_NO_COMPLETE;
2918     } else {
2919         status = req->status;
2920     }
2921 
2922     return status;
2923 }
2924 
2925 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
2926 {
2927     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2928     NvmeNamespace *ns = req->ns;
2929     uint64_t slba = le64_to_cpu(rw->slba);
2930     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2931     uint16_t ctrl = le16_to_cpu(rw->control);
2932     uint64_t data_size = nvme_l2b(ns, nlb);
2933     uint64_t mapped_size = data_size;
2934     uint64_t data_offset;
2935     BlockBackend *blk = ns->blkconf.blk;
2936     uint16_t status;
2937 
2938     if (nvme_ns_ext(ns)) {
2939         mapped_size += nvme_m2b(ns, nlb);
2940 
2941         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2942             bool pract = ctrl & NVME_RW_PRINFO_PRACT;
2943 
2944             if (pract && ns->lbaf.ms == 8) {
2945                 mapped_size = data_size;
2946             }
2947         }
2948     }
2949 
2950     trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
2951 
2952     status = nvme_check_mdts(n, mapped_size);
2953     if (status) {
2954         goto invalid;
2955     }
2956 
2957     status = nvme_check_bounds(ns, slba, nlb);
2958     if (status) {
2959         goto invalid;
2960     }
2961 
2962     if (ns->params.zoned) {
2963         status = nvme_check_zone_read(ns, slba, nlb);
2964         if (status) {
2965             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
2966             goto invalid;
2967         }
2968     }
2969 
2970     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2971         status = nvme_check_dulbe(ns, slba, nlb);
2972         if (status) {
2973             goto invalid;
2974         }
2975     }
2976 
2977     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2978         return nvme_dif_rw(n, req);
2979     }
2980 
2981     status = nvme_map_data(n, nlb, req);
2982     if (status) {
2983         goto invalid;
2984     }
2985 
2986     data_offset = nvme_l2b(ns, slba);
2987 
2988     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
2989                      BLOCK_ACCT_READ);
2990     nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
2991     return NVME_NO_COMPLETE;
2992 
2993 invalid:
2994     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
2995     return status | NVME_DNR;
2996 }
2997 
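/*
 * Common write path for Write, Write Zeroes (wrz) and Zone Append (append).
 * For zoned namespaces the write is checked against the target zone, the
 * Zone Append SLBA and reference tag remapping is applied, and the internal
 * zone write pointer (w_ptr) is advanced before the I/O is submitted.
 */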
2998 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
2999                               bool wrz)
3000 {
3001     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3002     NvmeNamespace *ns = req->ns;
3003     uint64_t slba = le64_to_cpu(rw->slba);
3004     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3005     uint16_t ctrl = le16_to_cpu(rw->control);
3006     uint64_t data_size = nvme_l2b(ns, nlb);
3007     uint64_t mapped_size = data_size;
3008     uint64_t data_offset;
3009     NvmeZone *zone;
3010     NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3011     BlockBackend *blk = ns->blkconf.blk;
3012     uint16_t status;
3013 
3014     if (nvme_ns_ext(ns)) {
3015         mapped_size += nvme_m2b(ns, nlb);
3016 
3017         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3018             bool pract = ctrl & NVME_RW_PRINFO_PRACT;
3019 
3020             if (pract && ns->lbaf.ms == 8) {
3021                 mapped_size -= nvme_m2b(ns, nlb);
3022             }
3023         }
3024     }
3025 
3026     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3027                          nvme_nsid(ns), nlb, mapped_size, slba);
3028 
3029     if (!wrz) {
3030         status = nvme_check_mdts(n, mapped_size);
3031         if (status) {
3032             goto invalid;
3033         }
3034     }
3035 
3036     status = nvme_check_bounds(ns, slba, nlb);
3037     if (status) {
3038         goto invalid;
3039     }
3040 
3041     if (ns->params.zoned) {
3042         zone = nvme_get_zone_by_slba(ns, slba);
3043 
3044         if (append) {
3045             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3046 
3047             if (unlikely(slba != zone->d.zslba)) {
3048                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3049                 status = NVME_INVALID_FIELD;
3050                 goto invalid;
3051             }
3052 
3053             if (n->params.zasl &&
3054                 data_size > (uint64_t)n->page_size << n->params.zasl) {
3055                 trace_pci_nvme_err_zasl(data_size);
3056                 return NVME_INVALID_FIELD | NVME_DNR;
3057             }
3058 
3059             slba = zone->w_ptr;
3060             rw->slba = cpu_to_le64(slba);
3061             res->slba = cpu_to_le64(slba);
3062 
3063             switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3064             case NVME_ID_NS_DPS_TYPE_1:
3065                 if (!piremap) {
3066                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3067                 }
3068 
3069                 /* fallthrough */
3070 
3071             case NVME_ID_NS_DPS_TYPE_2:
3072                 if (piremap) {
3073                     uint32_t reftag = le32_to_cpu(rw->reftag);
3074                     rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3075                 }
3076 
3077                 break;
3078 
3079             case NVME_ID_NS_DPS_TYPE_3:
3080                 if (piremap) {
3081                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3082                 }
3083 
3084                 break;
3085             }
3086         }
3087 
3088         status = nvme_check_zone_write(ns, zone, slba, nlb);
3089         if (status) {
3090             goto invalid;
3091         }
3092 
3093         status = nvme_zrm_auto(n, ns, zone);
3094         if (status) {
3095             goto invalid;
3096         }
3097 
3098         zone->w_ptr += nlb;
3099     }
3100 
3101     data_offset = nvme_l2b(ns, slba);
3102 
3103     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3104         return nvme_dif_rw(n, req);
3105     }
3106 
3107     if (!wrz) {
3108         status = nvme_map_data(n, nlb, req);
3109         if (status) {
3110             goto invalid;
3111         }
3112 
3113         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3114                          BLOCK_ACCT_WRITE);
3115         nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3116     } else {
3117         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3118                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3119                                            req);
3120     }
3121 
3122     return NVME_NO_COMPLETE;
3123 
3124 invalid:
3125     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3126     return status | NVME_DNR;
3127 }
3128 
3129 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3130 {
3131     return nvme_do_write(n, req, false, false);
3132 }
3133 
3134 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3135 {
3136     return nvme_do_write(n, req, false, true);
3137 }
3138 
3139 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3140 {
3141     return nvme_do_write(n, req, true, false);
3142 }
3143 
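/*
 * Extract the zone SLBA from CDW10/11 of a zone management command and
 * translate it into an index into the namespace zone array.
 */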
3144 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3145                                             uint64_t *slba, uint32_t *zone_idx)
3146 {
3147     uint32_t dw10 = le32_to_cpu(c->cdw10);
3148     uint32_t dw11 = le32_to_cpu(c->cdw11);
3149 
3150     if (!ns->params.zoned) {
3151         trace_pci_nvme_err_invalid_opc(c->opcode);
3152         return NVME_INVALID_OPCODE | NVME_DNR;
3153     }
3154 
3155     *slba = ((uint64_t)dw11) << 32 | dw10;
3156     if (unlikely(*slba >= ns->id_ns.nsze)) {
3157         trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3158         *slba = 0;
3159         return NVME_LBA_RANGE | NVME_DNR;
3160     }
3161 
3162     *zone_idx = nvme_zone_idx(ns, *slba);
3163     assert(*zone_idx < ns->num_zones);
3164 
3165     return NVME_SUCCESS;
3166 }
3167 
3168 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3169                                  NvmeRequest *);
3170 
3171 enum NvmeZoneProcessingMask {
3172     NVME_PROC_CURRENT_ZONE    = 0,
3173     NVME_PROC_OPENED_ZONES    = 1 << 0,
3174     NVME_PROC_CLOSED_ZONES    = 1 << 1,
3175     NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3176     NVME_PROC_FULL_ZONES      = 1 << 3,
3177 };
3178 
3179 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3180                                NvmeZoneState state, NvmeRequest *req)
3181 {
3182     return nvme_zrm_open(nvme_ctrl(req), ns, zone);
3183 }
3184 
3185 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3186                                 NvmeZoneState state, NvmeRequest *req)
3187 {
3188     return nvme_zrm_close(ns, zone);
3189 }
3190 
3191 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3192                                  NvmeZoneState state, NvmeRequest *req)
3193 {
3194     return nvme_zrm_finish(ns, zone);
3195 }
3196 
3197 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
3198                                 NvmeZoneState state, NvmeRequest *req)
3199 {
3200     uintptr_t *resets = (uintptr_t *)&req->opaque;
3201     struct nvme_zone_reset_ctx *ctx;
3202 
3203     switch (state) {
3204     case NVME_ZONE_STATE_EMPTY:
3205         return NVME_SUCCESS;
3206     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3207     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3208     case NVME_ZONE_STATE_CLOSED:
3209     case NVME_ZONE_STATE_FULL:
3210         break;
3211     default:
3212         return NVME_ZONE_INVAL_TRANSITION;
3213     }
3214 
3215     /*
3216      * The zone reset aio callback needs to know the zone that is being reset
3217      * in order to transition the zone on completion.
3218      */
3219     ctx = g_new(struct nvme_zone_reset_ctx, 1);
3220     ctx->req = req;
3221     ctx->zone = zone;
3222 
3223     (*resets)++;
3224 
3225     blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
3226                           nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
3227                           nvme_aio_zone_reset_cb, ctx);
3228 
3229     return NVME_NO_COMPLETE;
3230 }
3231 
3232 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3233                                   NvmeZoneState state, NvmeRequest *req)
3234 {
3235     switch (state) {
3236     case NVME_ZONE_STATE_READ_ONLY:
3237         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3238         /* fall through */
3239     case NVME_ZONE_STATE_OFFLINE:
3240         return NVME_SUCCESS;
3241     default:
3242         return NVME_ZONE_INVAL_TRANSITION;
3243     }
3244 }
3245 
3246 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3247 {
3248     uint16_t status;
3249     uint8_t state = nvme_get_zone_state(zone);
3250 
3251     if (state == NVME_ZONE_STATE_EMPTY) {
3252         status = nvme_aor_check(ns, 1, 0);
3253         if (status) {
3254             return status;
3255         }
3256         nvme_aor_inc_active(ns);
3257         zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3258         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3259         return NVME_SUCCESS;
3260     }
3261 
3262     return NVME_ZONE_INVAL_TRANSITION;
3263 }
3264 
3265 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3266                                     enum NvmeZoneProcessingMask proc_mask,
3267                                     op_handler_t op_hndlr, NvmeRequest *req)
3268 {
3269     uint16_t status = NVME_SUCCESS;
3270     NvmeZoneState zs = nvme_get_zone_state(zone);
3271     bool proc_zone;
3272 
3273     switch (zs) {
3274     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3275     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3276         proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3277         break;
3278     case NVME_ZONE_STATE_CLOSED:
3279         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3280         break;
3281     case NVME_ZONE_STATE_READ_ONLY:
3282         proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3283         break;
3284     case NVME_ZONE_STATE_FULL:
3285         proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3286         break;
3287     default:
3288         proc_zone = false;
3289     }
3290 
3291     if (proc_zone) {
3292         status = op_hndlr(ns, zone, zs, req);
3293     }
3294 
3295     return status;
3296 }
3297 
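/*
 * Apply op_hndlr either to the single zone selected by the command or, when
 * a processing mask is given (Select All), to every zone whose current
 * state matches the mask.
 */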
3298 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3299                                 enum NvmeZoneProcessingMask proc_mask,
3300                                 op_handler_t op_hndlr, NvmeRequest *req)
3301 {
3302     NvmeZone *next;
3303     uint16_t status = NVME_SUCCESS;
3304     int i;
3305 
3306     if (!proc_mask) {
3307         status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3308     } else {
3309         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3310             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3311                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3312                                              req);
3313                 if (status && status != NVME_NO_COMPLETE) {
3314                     goto out;
3315                 }
3316             }
3317         }
3318         if (proc_mask & NVME_PROC_OPENED_ZONES) {
3319             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3320                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3321                                              req);
3322                 if (status && status != NVME_NO_COMPLETE) {
3323                     goto out;
3324                 }
3325             }
3326 
3327             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3328                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3329                                              req);
3330                 if (status && status != NVME_NO_COMPLETE) {
3331                     goto out;
3332                 }
3333             }
3334         }
3335         if (proc_mask & NVME_PROC_FULL_ZONES) {
3336             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3337                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3338                                              req);
3339                 if (status && status != NVME_NO_COMPLETE) {
3340                     goto out;
3341                 }
3342             }
3343         }
3344 
3345         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3346             for (i = 0; i < ns->num_zones; i++, zone++) {
3347                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3348                                              req);
3349                 if (status && status != NVME_NO_COMPLETE) {
3350                     goto out;
3351                 }
3352             }
3353         }
3354     }
3355 
3356 out:
3357     return status;
3358 }
3359 
3360 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3361 {
3362     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3363     NvmeNamespace *ns = req->ns;
3364     NvmeZone *zone;
3365     uintptr_t *resets;
3366     uint8_t *zd_ext;
3367     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3368     uint64_t slba = 0;
3369     uint32_t zone_idx = 0;
3370     uint16_t status;
3371     uint8_t action;
3372     bool all;
3373     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3374 
3375     action = dw13 & 0xff;
3376     all = dw13 & 0x100;
3377 
3378     req->status = NVME_SUCCESS;
3379 
3380     if (!all) {
3381         status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3382         if (status) {
3383             return status;
3384         }
3385     }
3386 
3387     zone = &ns->zone_array[zone_idx];
3388     if (slba != zone->d.zslba) {
3389         trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3390         return NVME_INVALID_FIELD | NVME_DNR;
3391     }
3392 
3393     switch (action) {
3394 
3395     case NVME_ZONE_ACTION_OPEN:
3396         if (all) {
3397             proc_mask = NVME_PROC_CLOSED_ZONES;
3398         }
3399         trace_pci_nvme_open_zone(slba, zone_idx, all);
3400         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3401         break;
3402 
3403     case NVME_ZONE_ACTION_CLOSE:
3404         if (all) {
3405             proc_mask = NVME_PROC_OPENED_ZONES;
3406         }
3407         trace_pci_nvme_close_zone(slba, zone_idx, all);
3408         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3409         break;
3410 
3411     case NVME_ZONE_ACTION_FINISH:
3412         if (all) {
3413             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3414         }
3415         trace_pci_nvme_finish_zone(slba, zone_idx, all);
3416         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3417         break;
3418 
3419     case NVME_ZONE_ACTION_RESET:
3420         resets = (uintptr_t *)&req->opaque;
3421 
3422         if (all) {
3423             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
3424                 NVME_PROC_FULL_ZONES;
3425         }
3426         trace_pci_nvme_reset_zone(slba, zone_idx, all);
3427 
3428         *resets = 1;
3429 
3430         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
3431 
3432         (*resets)--;
3433 
3434         return *resets ? NVME_NO_COMPLETE : req->status;
3435 
3436     case NVME_ZONE_ACTION_OFFLINE:
3437         if (all) {
3438             proc_mask = NVME_PROC_READ_ONLY_ZONES;
3439         }
3440         trace_pci_nvme_offline_zone(slba, zone_idx, all);
3441         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3442         break;
3443 
3444     case NVME_ZONE_ACTION_SET_ZD_EXT:
3445         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3446         if (all || !ns->params.zd_extension_size) {
3447             return NVME_INVALID_FIELD | NVME_DNR;
3448         }
3449         zd_ext = nvme_get_zd_extension(ns, zone_idx);
3450         status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3451         if (status) {
3452             trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3453             return status;
3454         }
3455 
3456         status = nvme_set_zd_ext(ns, zone);
3457         if (status == NVME_SUCCESS) {
3458             trace_pci_nvme_zd_extension_set(zone_idx);
3459             return status;
3460         }
3461         break;
3462 
3463     default:
3464         trace_pci_nvme_err_invalid_mgmt_action(action);
3465         status = NVME_INVALID_FIELD;
3466     }
3467 
3468     if (status == NVME_ZONE_INVAL_TRANSITION) {
3469         trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3470                                                          zone->d.za);
3471     }
3472     if (status) {
3473         status |= NVME_DNR;
3474     }
3475 
3476     return status;
3477 }
3478 
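/*
 * Map the Zone Receive Action Specific field (ZRASF) of a Zone Management
 * Receive command to the current zone state. NVME_ZONE_REPORT_ALL matches
 * every zone; each other value matches exactly one zone state.
 */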
3479 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3480 {
3481     NvmeZoneState zs = nvme_get_zone_state(zl);
3482 
3483     switch (zafs) {
3484     case NVME_ZONE_REPORT_ALL:
3485         return true;
3486     case NVME_ZONE_REPORT_EMPTY:
3487         return zs == NVME_ZONE_STATE_EMPTY;
3488     case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3489         return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3490     case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3491         return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3492     case NVME_ZONE_REPORT_CLOSED:
3493         return zs == NVME_ZONE_STATE_CLOSED;
3494     case NVME_ZONE_REPORT_FULL:
3495         return zs == NVME_ZONE_STATE_FULL;
3496     case NVME_ZONE_REPORT_READ_ONLY:
3497         return zs == NVME_ZONE_STATE_READ_ONLY;
3498     case NVME_ZONE_REPORT_OFFLINE:
3499         return zs == NVME_ZONE_STATE_OFFLINE;
3500     default:
3501         return false;
3502     }
3503 }
3504 
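/*
 * Zone Management Receive. The zone array is walked twice: first to count the
 * zones matching the ZRASF filter for the report header (capped at the number
 * of descriptors that fit in the buffer when the Partial Report bit is set),
 * then to fill in the zone descriptors and, for the extended report, the
 * per-zone descriptor extensions.
 */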
3505 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3506 {
3507     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3508     NvmeNamespace *ns = req->ns;
3509     /* cdw12 is the zero-based number of dwords to return. Convert to bytes */
3510     uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3511     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3512     uint32_t zone_idx, zra, zrasf, partial;
3513     uint64_t max_zones, nr_zones = 0;
3514     uint16_t status;
3515     uint64_t slba;
3516     NvmeZoneDescr *z;
3517     NvmeZone *zone;
3518     NvmeZoneReportHeader *header;
3519     void *buf, *buf_p;
3520     size_t zone_entry_sz;
3521     int i;
3522 
3523     req->status = NVME_SUCCESS;
3524 
3525     status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3526     if (status) {
3527         return status;
3528     }
3529 
3530     zra = dw13 & 0xff;
3531     if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3532         return NVME_INVALID_FIELD | NVME_DNR;
3533     }
3534     if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3535         return NVME_INVALID_FIELD | NVME_DNR;
3536     }
3537 
3538     zrasf = (dw13 >> 8) & 0xff;
3539     if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3540         return NVME_INVALID_FIELD | NVME_DNR;
3541     }
3542 
3543     if (data_size < sizeof(NvmeZoneReportHeader)) {
3544         return NVME_INVALID_FIELD | NVME_DNR;
3545     }
3546 
3547     status = nvme_check_mdts(n, data_size);
3548     if (status) {
3549         return status;
3550     }
3551 
3552     partial = (dw13 >> 16) & 0x01;
3553 
3554     zone_entry_sz = sizeof(NvmeZoneDescr);
3555     if (zra == NVME_ZONE_REPORT_EXTENDED) {
3556         zone_entry_sz += ns->params.zd_extension_size;
3557     }
3558 
3559     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3560     buf = g_malloc0(data_size);
3561 
3562     zone = &ns->zone_array[zone_idx];
3563     for (i = zone_idx; i < ns->num_zones; i++) {
3564         if (partial && nr_zones >= max_zones) {
3565             break;
3566         }
3567         if (nvme_zone_matches_filter(zrasf, zone++)) {
3568             nr_zones++;
3569         }
3570     }
3571     header = (NvmeZoneReportHeader *)buf;
3572     header->nr_zones = cpu_to_le64(nr_zones);
3573 
3574     buf_p = buf + sizeof(NvmeZoneReportHeader);
3575     for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3576         zone = &ns->zone_array[zone_idx];
3577         if (nvme_zone_matches_filter(zrasf, zone)) {
3578             z = (NvmeZoneDescr *)buf_p;
3579             buf_p += sizeof(NvmeZoneDescr);
3580 
3581             z->zt = zone->d.zt;
3582             z->zs = zone->d.zs;
3583             z->zcap = cpu_to_le64(zone->d.zcap);
3584             z->zslba = cpu_to_le64(zone->d.zslba);
3585             z->za = zone->d.za;
3586 
3587             if (nvme_wp_is_valid(zone)) {
3588                 z->wp = cpu_to_le64(zone->d.wp);
3589             } else {
3590                 z->wp = cpu_to_le64(~0ULL);
3591             }
3592 
3593             if (zra == NVME_ZONE_REPORT_EXTENDED) {
3594                 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3595                     memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3596                            ns->params.zd_extension_size);
3597                 }
3598                 buf_p += ns->params.zd_extension_size;
3599             }
3600 
3601             max_zones--;
3602         }
3603     }
3604 
3605     status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3606 
3607     g_free(buf);
3608 
3609     return status;
3610 }
3611 
3612 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3613 {
3614     NvmeNamespace *ns;
3615     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3616 
3617     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3618                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3619 
3620     if (!nvme_nsid_valid(n, nsid)) {
3621         return NVME_INVALID_NSID | NVME_DNR;
3622     }
3623 
3624     /*
3625      * In the base NVM command set, Flush may apply to all namespaces
3626      * (indicated by NSID being set to FFFFFFFFh). But when that feature
3627      * is used together with TP 4056 (Namespace Types), the semantics
3628      * become murky.
3629      *
3630      * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
3631      * opcode with a specific command since we cannot determine a unique
3632      * I/O command set. Opcode 0h might well have completely different
3633      * semantics in some other command set - does an NSID of FFFFFFFFh
3634      * then mean "for all namespaces, apply whatever command set specific
3635      * command uses the 0h opcode"? Or does it mean "for all namespaces,
3636      * apply whatever command uses the 0h opcode if, and only if, that
3637      * command allows NSID to be FFFFFFFFh"?
3638      *
3639      * Luckily, for now, we do not have to care about this, since the
3640      * device only supports namespace types that include the NVM Flush
3641      * command (NVM and Zoned), so always do an NVM Flush.
3642      */
3643     if (req->cmd.opcode == NVME_CMD_FLUSH) {
3644         return nvme_flush(n, req);
3645     }
3646 
3647     ns = nvme_ns(n, nsid);
3648     if (unlikely(!ns)) {
3649         return NVME_INVALID_FIELD | NVME_DNR;
3650     }
3651 
3652     if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3653         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3654         return NVME_INVALID_OPCODE | NVME_DNR;
3655     }
3656 
3657     if (ns->status) {
3658         return ns->status;
3659     }
3660 
3661     req->ns = ns;
3662 
3663     switch (req->cmd.opcode) {
3664     case NVME_CMD_WRITE_ZEROES:
3665         return nvme_write_zeroes(n, req);
3666     case NVME_CMD_ZONE_APPEND:
3667         return nvme_zone_append(n, req);
3668     case NVME_CMD_WRITE:
3669         return nvme_write(n, req);
3670     case NVME_CMD_READ:
3671         return nvme_read(n, req);
3672     case NVME_CMD_COMPARE:
3673         return nvme_compare(n, req);
3674     case NVME_CMD_DSM:
3675         return nvme_dsm(n, req);
3676     case NVME_CMD_VERIFY:
3677         return nvme_verify(n, req);
3678     case NVME_CMD_COPY:
3679         return nvme_copy(n, req);
3680     case NVME_CMD_ZONE_MGMT_SEND:
3681         return nvme_zone_mgmt_send(n, req);
3682     case NVME_CMD_ZONE_MGMT_RECV:
3683         return nvme_zone_mgmt_recv(n, req);
3684     default:
3685         assert(false);
3686     }
3687 
3688     return NVME_INVALID_OPCODE | NVME_DNR;
3689 }
3690 
3691 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3692 {
3693     n->sq[sq->sqid] = NULL;
3694     timer_free(sq->timer);
3695     g_free(sq->io_req);
3696     if (sq->sqid) {
3697         g_free(sq);
3698     }
3699 }
3700 
3701 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3702 {
3703     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3704     NvmeRequest *r, *next;
3705     NvmeSQueue *sq;
3706     NvmeCQueue *cq;
3707     uint16_t qid = le16_to_cpu(c->qid);
3708     uint32_t nsid;
3709 
3710     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3711         trace_pci_nvme_err_invalid_del_sq(qid);
3712         return NVME_INVALID_QID | NVME_DNR;
3713     }
3714 
3715     trace_pci_nvme_del_sq(qid);
3716 
3717     sq = n->sq[qid];
3718     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3719         r = QTAILQ_FIRST(&sq->out_req_list);
3720         if (r->aiocb) {
3721             blk_aio_cancel(r->aiocb);
3722         }
3723     }
3724 
3725     /*
3726      * Drain all namespaces if there are still outstanding requests that we
3727      * could not cancel explicitly.
3728      */
3729     if (!QTAILQ_EMPTY(&sq->out_req_list)) {
3730         for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
3731             NvmeNamespace *ns = nvme_ns(n, nsid);
3732             if (ns) {
3733                 nvme_ns_drain(ns);
3734             }
3735         }
3736     }
3737 
3738     assert(QTAILQ_EMPTY(&sq->out_req_list));
3739 
3740     if (!nvme_check_cqid(n, sq->cqid)) {
3741         cq = n->cq[sq->cqid];
3742         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3743 
3744         nvme_post_cqes(cq);
3745         QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3746             if (r->sq == sq) {
3747                 QTAILQ_REMOVE(&cq->req_list, r, entry);
3748                 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3749             }
3750         }
3751     }
3752 
3753     nvme_free_sq(sq, n);
3754     return NVME_SUCCESS;
3755 }
3756 
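/*
 * Initialize a submission queue: preallocate one NvmeRequest per queue entry
 * on the free request list, create the timer that drives nvme_process_sq()
 * and link the queue to its completion queue.
 */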
3757 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3758                          uint16_t sqid, uint16_t cqid, uint16_t size)
3759 {
3760     int i;
3761     NvmeCQueue *cq;
3762 
3763     sq->ctrl = n;
3764     sq->dma_addr = dma_addr;
3765     sq->sqid = sqid;
3766     sq->size = size;
3767     sq->cqid = cqid;
3768     sq->head = sq->tail = 0;
3769     sq->io_req = g_new0(NvmeRequest, sq->size);
3770 
3771     QTAILQ_INIT(&sq->req_list);
3772     QTAILQ_INIT(&sq->out_req_list);
3773     for (i = 0; i < sq->size; i++) {
3774         sq->io_req[i].sq = sq;
3775         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
3776     }
3777     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
3778 
3779     assert(n->cq[cqid]);
3780     cq = n->cq[cqid];
3781     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
3782     n->sq[sqid] = sq;
3783 }
3784 
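/*
 * Create I/O Submission Queue. QSIZE is a 0's based value, hence the
 * "qsize + 1" passed to nvme_init_sq(); the queue must be physically
 * contiguous (PC flag set) and its base address page aligned.
 */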
3785 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
3786 {
3787     NvmeSQueue *sq;
3788     NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
3789 
3790     uint16_t cqid = le16_to_cpu(c->cqid);
3791     uint16_t sqid = le16_to_cpu(c->sqid);
3792     uint16_t qsize = le16_to_cpu(c->qsize);
3793     uint16_t qflags = le16_to_cpu(c->sq_flags);
3794     uint64_t prp1 = le64_to_cpu(c->prp1);
3795 
3796     trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
3797 
3798     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
3799         trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
3800         return NVME_INVALID_CQID | NVME_DNR;
3801     }
3802     if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
3803         n->sq[sqid] != NULL)) {
3804         trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
3805         return NVME_INVALID_QID | NVME_DNR;
3806     }
3807     if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
3808         trace_pci_nvme_err_invalid_create_sq_size(qsize);
3809         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
3810     }
3811     if (unlikely(prp1 & (n->page_size - 1))) {
3812         trace_pci_nvme_err_invalid_create_sq_addr(prp1);
3813         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
3814     }
3815     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
3816         trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
3817         return NVME_INVALID_FIELD | NVME_DNR;
3818     }
3819     sq = g_malloc0(sizeof(*sq));
3820     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
3821     return NVME_SUCCESS;
3822 }
3823 
3824 struct nvme_stats {
3825     uint64_t units_read;
3826     uint64_t units_written;
3827     uint64_t read_commands;
3828     uint64_t write_commands;
3829 };
3830 
3831 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
3832 {
3833     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
3834 
3835     stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
3836     stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
3837     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
3838     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
3839 }
3840 
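/*
 * SMART / Health Information log page. Data units are reported in units of
 * 1000 512-byte sectors (hence the DIV_ROUND_UP(..., 1000) below); when the
 * command specifies the broadcast NSID (FFFFFFFFh), statistics are
 * accumulated over all attached namespaces.
 */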
3841 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3842                                 uint64_t off, NvmeRequest *req)
3843 {
3844     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3845     struct nvme_stats stats = { 0 };
3846     NvmeSmartLog smart = { 0 };
3847     uint32_t trans_len;
3848     NvmeNamespace *ns;
3849     time_t current_ms;
3850 
3851     if (off >= sizeof(smart)) {
3852         return NVME_INVALID_FIELD | NVME_DNR;
3853     }
3854 
3855     if (nsid != 0xffffffff) {
3856         ns = nvme_ns(n, nsid);
3857         if (!ns) {
3858             return NVME_INVALID_NSID | NVME_DNR;
3859         }
3860         nvme_set_blk_stats(ns, &stats);
3861     } else {
3862         int i;
3863 
3864         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
3865             ns = nvme_ns(n, i);
3866             if (!ns) {
3867                 continue;
3868             }
3869             nvme_set_blk_stats(ns, &stats);
3870         }
3871     }
3872 
3873     trans_len = MIN(sizeof(smart) - off, buf_len);
3874     smart.critical_warning = n->smart_critical_warning;
3875 
3876     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
3877                                                         1000));
3878     smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
3879                                                            1000));
3880     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
3881     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
3882 
3883     smart.temperature = cpu_to_le16(n->temperature);
3884 
3885     if ((n->temperature >= n->features.temp_thresh_hi) ||
3886         (n->temperature <= n->features.temp_thresh_low)) {
3887         smart.critical_warning |= NVME_SMART_TEMPERATURE;
3888     }
3889 
3890     current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
3891     smart.power_on_hours[0] =
3892         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
3893 
3894     if (!rae) {
3895         nvme_clear_events(n, NVME_AER_TYPE_SMART);
3896     }
3897 
3898     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
3899 }
3900 
3901 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
3902                                  NvmeRequest *req)
3903 {
3904     uint32_t trans_len;
3905     NvmeFwSlotInfoLog fw_log = {
3906         .afi = 0x1,
3907     };
3908 
3909     if (off >= sizeof(fw_log)) {
3910         return NVME_INVALID_FIELD | NVME_DNR;
3911     }
3912 
3913     strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
3914     trans_len = MIN(sizeof(fw_log) - off, buf_len);
3915 
3916     return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
3917 }
3918 
3919 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3920                                 uint64_t off, NvmeRequest *req)
3921 {
3922     uint32_t trans_len;
3923     NvmeErrorLog errlog;
3924 
3925     if (off >= sizeof(errlog)) {
3926         return NVME_INVALID_FIELD | NVME_DNR;
3927     }
3928 
3929     if (!rae) {
3930         nvme_clear_events(n, NVME_AER_TYPE_ERROR);
3931     }
3932 
3933     memset(&errlog, 0x0, sizeof(errlog));
3934     trans_len = MIN(sizeof(errlog) - off, buf_len);
3935 
3936     return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
3937 }
3938 
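/*
 * Changed Namespace List log page: up to 1024 namespace IDs, taken from the
 * changed_nsids bitmap. Each reported NSID is cleared from the bitmap, and
 * unless RAE (Retain Asynchronous Event) is set the corresponding Notice
 * event is cleared as well.
 */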
3939 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3940                                     uint64_t off, NvmeRequest *req)
3941 {
3942     uint32_t nslist[1024];
3943     uint32_t trans_len;
3944     int i = 0;
3945     uint32_t nsid;
3946 
3947     memset(nslist, 0x0, sizeof(nslist));
3948     trans_len = MIN(sizeof(nslist) - off, buf_len);
3949 
3950     while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
3951             NVME_CHANGED_NSID_SIZE) {
3952         /*
3953          * If more than 1024 namespaces have changed, the spec requires the
3954          * first entry in the log page to be set to FFFFFFFFh and the rest to 0.
3955          */
3956         if (i == ARRAY_SIZE(nslist)) {
3957             memset(nslist, 0x0, sizeof(nslist));
3958             nslist[0] = 0xffffffff;
3959             break;
3960         }
3961 
3962         nslist[i++] = nsid;
3963         clear_bit(nsid, n->changed_nsids);
3964     }
3965 
3966     /*
3967      * Clear all remaining changed-namespace bits if we broke out of the loop
3968      * above because more than 1024 namespaces have changed.
3969      */
3970     if (nslist[0] == 0xffffffff) {
3971         bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
3972     }
3973 
3974     if (!rae) {
3975         nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
3976     }
3977 
3978     return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
3979 }
3980 
3981 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
3982                                  uint64_t off, NvmeRequest *req)
3983 {
3984     NvmeEffectsLog log = {};
3985     const uint32_t *src_iocs = NULL;
3986     uint32_t trans_len;
3987 
3988     if (off >= sizeof(log)) {
3989         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
3990         return NVME_INVALID_FIELD | NVME_DNR;
3991     }
3992 
3993     switch (NVME_CC_CSS(n->bar.cc)) {
3994     case NVME_CC_CSS_NVM:
3995         src_iocs = nvme_cse_iocs_nvm;
3996         /* fall through */
3997     case NVME_CC_CSS_ADMIN_ONLY:
3998         break;
3999     case NVME_CC_CSS_CSI:
4000         switch (csi) {
4001         case NVME_CSI_NVM:
4002             src_iocs = nvme_cse_iocs_nvm;
4003             break;
4004         case NVME_CSI_ZONED:
4005             src_iocs = nvme_cse_iocs_zoned;
4006             break;
4007         }
4008     }
4009 
4010     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4011 
4012     if (src_iocs) {
4013         memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4014     }
4015 
4016     trans_len = MIN(sizeof(log) - off, buf_len);
4017 
4018     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4019 }
4020 
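/*
 * Get Log Page. NUMDU/NUMDL form a 0's based dword count and LPOU/LPOL a
 * byte offset into the page, so e.g. NUMDL = 3FFh with NUMDU = 0 requests
 * (0x3ff + 1) * 4 = 4096 bytes. The offset must be dword aligned.
 */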
4021 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4022 {
4023     NvmeCmd *cmd = &req->cmd;
4024 
4025     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4026     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4027     uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4028     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4029     uint8_t  lid = dw10 & 0xff;
4030     uint8_t  lsp = (dw10 >> 8) & 0xf;
4031     uint8_t  rae = (dw10 >> 15) & 0x1;
4032     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
4033     uint32_t numdl, numdu;
4034     uint64_t off, lpol, lpou;
4035     size_t   len;
4036     uint16_t status;
4037 
4038     numdl = (dw10 >> 16);
4039     numdu = (dw11 & 0xffff);
4040     lpol = dw12;
4041     lpou = dw13;
4042 
4043     len = (((numdu << 16) | numdl) + 1) << 2;
4044     off = (lpou << 32ULL) | lpol;
4045 
4046     if (off & 0x3) {
4047         return NVME_INVALID_FIELD | NVME_DNR;
4048     }
4049 
4050     trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4051 
4052     status = nvme_check_mdts(n, len);
4053     if (status) {
4054         return status;
4055     }
4056 
4057     switch (lid) {
4058     case NVME_LOG_ERROR_INFO:
4059         return nvme_error_info(n, rae, len, off, req);
4060     case NVME_LOG_SMART_INFO:
4061         return nvme_smart_info(n, rae, len, off, req);
4062     case NVME_LOG_FW_SLOT_INFO:
4063         return nvme_fw_log_info(n, len, off, req);
4064     case NVME_LOG_CHANGED_NSLIST:
4065         return nvme_changed_nslist(n, rae, len, off, req);
4066     case NVME_LOG_CMD_EFFECTS:
4067         return nvme_cmd_effects(n, csi, len, off, req);
4068     default:
4069         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4070         return NVME_INVALID_FIELD | NVME_DNR;
4071     }
4072 }
4073 
4074 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4075 {
4076     n->cq[cq->cqid] = NULL;
4077     timer_free(cq->timer);
4078     if (msix_enabled(&n->parent_obj)) {
4079         msix_vector_unuse(&n->parent_obj, cq->vector);
4080     }
4081     if (cq->cqid) {
4082         g_free(cq);
4083     }
4084 }
4085 
4086 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4087 {
4088     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4089     NvmeCQueue *cq;
4090     uint16_t qid = le16_to_cpu(c->qid);
4091 
4092     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4093         trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4094         return NVME_INVALID_CQID | NVME_DNR;
4095     }
4096 
4097     cq = n->cq[qid];
4098     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4099         trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4100         return NVME_INVALID_QUEUE_DEL;
4101     }
4102     nvme_irq_deassert(n, cq);
4103     trace_pci_nvme_del_cq(qid);
4104     nvme_free_cq(cq, n);
4105     return NVME_SUCCESS;
4106 }
4107 
4108 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4109                          uint16_t cqid, uint16_t vector, uint16_t size,
4110                          uint16_t irq_enabled)
4111 {
4112     int ret;
4113 
4114     if (msix_enabled(&n->parent_obj)) {
4115         ret = msix_vector_use(&n->parent_obj, vector);
4116         assert(ret == 0);
4117     }
4118     cq->ctrl = n;
4119     cq->cqid = cqid;
4120     cq->size = size;
4121     cq->dma_addr = dma_addr;
4122     cq->phase = 1;
4123     cq->irq_enabled = irq_enabled;
4124     cq->vector = vector;
4125     cq->head = cq->tail = 0;
4126     QTAILQ_INIT(&cq->req_list);
4127     QTAILQ_INIT(&cq->sq_list);
4128     n->cq[cqid] = cq;
4129     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4130 }
4131 
4132 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4133 {
4134     NvmeCQueue *cq;
4135     NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4136     uint16_t cqid = le16_to_cpu(c->cqid);
4137     uint16_t vector = le16_to_cpu(c->irq_vector);
4138     uint16_t qsize = le16_to_cpu(c->qsize);
4139     uint16_t qflags = le16_to_cpu(c->cq_flags);
4140     uint64_t prp1 = le64_to_cpu(c->prp1);
4141 
4142     trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4143                              NVME_CQ_FLAGS_IEN(qflags) != 0);
4144 
4145     if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4146         n->cq[cqid] != NULL)) {
4147         trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4148         return NVME_INVALID_QID | NVME_DNR;
4149     }
4150     if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
4151         trace_pci_nvme_err_invalid_create_cq_size(qsize);
4152         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4153     }
4154     if (unlikely(prp1 & (n->page_size - 1))) {
4155         trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4156         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4157     }
4158     if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4159         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4160         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4161     }
4162     if (unlikely(vector >= n->params.msix_qsize)) {
4163         trace_pci_nvme_err_invalid_create_cq_vector(vector);
4164         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4165     }
4166     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4167         trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4168         return NVME_INVALID_FIELD | NVME_DNR;
4169     }
4170 
4171     cq = g_malloc0(sizeof(*cq));
4172     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4173                  NVME_CQ_FLAGS_IEN(qflags));
4174 
4175     /*
4176      * It is only required to set qs_created when creating a completion queue;
4177      * creating a submission queue without a matching completion queue will
4178      * fail.
4179      */
4180     n->qs_created = true;
4181     return NVME_SUCCESS;
4182 }
4183 
4184 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4185 {
4186     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4187 
4188     return nvme_c2h(n, id, sizeof(id), req);
4189 }
4190 
4191 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4192 {
4193     trace_pci_nvme_identify_ctrl();
4194 
4195     return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4196 }
4197 
4198 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4199 {
4200     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4201     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4202     NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4203 
4204     trace_pci_nvme_identify_ctrl_csi(c->csi);
4205 
4206     switch (c->csi) {
4207     case NVME_CSI_NVM:
4208         id_nvm->vsl = n->params.vsl;
4209         id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4210         break;
4211 
4212     case NVME_CSI_ZONED:
4213         ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4214         break;
4215 
4216     default:
4217         return NVME_INVALID_FIELD | NVME_DNR;
4218     }
4219 
4220     return nvme_c2h(n, id, sizeof(id), req);
4221 }
4222 
4223 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4224 {
4225     NvmeNamespace *ns;
4226     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4227     uint32_t nsid = le32_to_cpu(c->nsid);
4228 
4229     trace_pci_nvme_identify_ns(nsid);
4230 
4231     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4232         return NVME_INVALID_NSID | NVME_DNR;
4233     }
4234 
4235     ns = nvme_ns(n, nsid);
4236     if (unlikely(!ns)) {
4237         if (!active) {
4238             ns = nvme_subsys_ns(n->subsys, nsid);
4239             if (!ns) {
4240                 return nvme_rpt_empty_id_struct(n, req);
4241             }
4242         } else {
4243             return nvme_rpt_empty_id_struct(n, req);
4244         }
4245     }
4246 
4247     if (active || ns->csi == NVME_CSI_NVM) {
4248         return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4249     }
4250 
4251     return NVME_INVALID_CMD_SET | NVME_DNR;
4252 }
4253 
4254 static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
4255 {
4256     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4257     uint16_t min_id = le16_to_cpu(c->ctrlid);
4258     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4259     uint16_t *ids = &list[1];
4260     NvmeNamespace *ns;
4261     NvmeCtrl *ctrl;
4262     int cntlid, nr_ids = 0;
4263 
4264     trace_pci_nvme_identify_ns_attached_list(min_id);
4265 
4266     if (c->nsid == NVME_NSID_BROADCAST) {
4267         return NVME_INVALID_FIELD | NVME_DNR;
4268     }
4269 
4270     ns = nvme_subsys_ns(n->subsys, c->nsid);
4271     if (!ns) {
4272         return NVME_INVALID_FIELD | NVME_DNR;
4273     }
4274 
4275     for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4276         ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4277         if (!ctrl) {
4278             continue;
4279         }
4280 
4281         if (!nvme_ns(ctrl, c->nsid)) {
4282             continue;
4283         }
4284 
4285         ids[nr_ids++] = cntlid;
4286     }
4287 
4288     list[0] = nr_ids;
4289 
4290     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4291 }
4292 
4293 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4294                                      bool active)
4295 {
4296     NvmeNamespace *ns;
4297     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4298     uint32_t nsid = le32_to_cpu(c->nsid);
4299 
4300     trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4301 
4302     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4303         return NVME_INVALID_NSID | NVME_DNR;
4304     }
4305 
4306     ns = nvme_ns(n, nsid);
4307     if (unlikely(!ns)) {
4308         if (!active) {
4309             ns = nvme_subsys_ns(n->subsys, nsid);
4310             if (!ns) {
4311                 return nvme_rpt_empty_id_struct(n, req);
4312             }
4313         } else {
4314             return nvme_rpt_empty_id_struct(n, req);
4315         }
4316     }
4317 
4318     if (c->csi == NVME_CSI_NVM) {
4319         return nvme_rpt_empty_id_struct(n, req);
4320     } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4321         return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4322                         req);
4323     }
4324 
4325     return NVME_INVALID_FIELD | NVME_DNR;
4326 }
4327 
4328 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4329                                      bool active)
4330 {
4331     NvmeNamespace *ns;
4332     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4333     uint32_t min_nsid = le32_to_cpu(c->nsid);
4334     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4335     static const int data_len = sizeof(list);
4336     uint32_t *list_ptr = (uint32_t *)list;
4337     int i, j = 0;
4338 
4339     trace_pci_nvme_identify_nslist(min_nsid);
4340 
4341     /*
4342      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
4343      * since the Active Namespace ID List should return namespaces with IDs
4344      * *higher* than the NSID specified in the command. This is also specified
4345      * in the spec (NVM Express v1.3d, Section 5.15.4).
4346      */
4347     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4348         return NVME_INVALID_NSID | NVME_DNR;
4349     }
4350 
4351     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4352         ns = nvme_ns(n, i);
4353         if (!ns) {
4354             if (!active) {
4355                 ns = nvme_subsys_ns(n->subsys, i);
4356                 if (!ns) {
4357                     continue;
4358                 }
4359             } else {
4360                 continue;
4361             }
4362         }
4363         if (ns->params.nsid <= min_nsid) {
4364             continue;
4365         }
4366         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4367         if (j == data_len / sizeof(uint32_t)) {
4368             break;
4369         }
4370     }
4371 
4372     return nvme_c2h(n, list, data_len, req);
4373 }
4374 
4375 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4376                                          bool active)
4377 {
4378     NvmeNamespace *ns;
4379     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4380     uint32_t min_nsid = le32_to_cpu(c->nsid);
4381     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4382     static const int data_len = sizeof(list);
4383     uint32_t *list_ptr = (uint32_t *)list;
4384     int i, j = 0;
4385 
4386     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4387 
4388     /*
4389      * Same as in nvme_identify_nslist(): FFFFFFFFh and FFFFFFFEh are invalid.
4390      */
4391     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4392         return NVME_INVALID_NSID | NVME_DNR;
4393     }
4394 
4395     if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4396         return NVME_INVALID_FIELD | NVME_DNR;
4397     }
4398 
4399     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4400         ns = nvme_ns(n, i);
4401         if (!ns) {
4402             if (!active) {
4403                 ns = nvme_subsys_ns(n->subsys, i);
4404                 if (!ns) {
4405                     continue;
4406                 }
4407             } else {
4408                 continue;
4409             }
4410         }
4411         if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4412             continue;
4413         }
4414         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4415         if (j == data_len / sizeof(uint32_t)) {
4416             break;
4417         }
4418     }
4419 
4420     return nvme_c2h(n, list, data_len, req);
4421 }
4422 
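/*
 * Namespace Identification Descriptor list: always contains a UUID
 * descriptor, an EUI-64 descriptor when the namespace has a non-zero eui64
 * parameter, and a Command Set Identifier (CSI) descriptor.
 */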
4423 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4424 {
4425     NvmeNamespace *ns;
4426     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4427     uint32_t nsid = le32_to_cpu(c->nsid);
4428     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4429     uint8_t *pos = list;
4430     struct {
4431         NvmeIdNsDescr hdr;
4432         uint8_t v[NVME_NIDL_UUID];
4433     } QEMU_PACKED uuid;
4434     struct {
4435         NvmeIdNsDescr hdr;
4436         uint64_t v;
4437     } QEMU_PACKED eui64;
4438     struct {
4439         NvmeIdNsDescr hdr;
4440         uint8_t v;
4441     } QEMU_PACKED csi;
4442 
4443     trace_pci_nvme_identify_ns_descr_list(nsid);
4444 
4445     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4446         return NVME_INVALID_NSID | NVME_DNR;
4447     }
4448 
4449     ns = nvme_ns(n, nsid);
4450     if (unlikely(!ns)) {
4451         return NVME_INVALID_FIELD | NVME_DNR;
4452     }
4453 
4454     /*
4455      * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4456      * provide a valid Namespace UUID in the Namespace Identification Descriptor
4457      * data structure. QEMU does not yet support setting NGUID.
4458      */
4459     uuid.hdr.nidt = NVME_NIDT_UUID;
4460     uuid.hdr.nidl = NVME_NIDL_UUID;
4461     memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4462     memcpy(pos, &uuid, sizeof(uuid));
4463     pos += sizeof(uuid);
4464 
4465     if (ns->params.eui64) {
4466         eui64.hdr.nidt = NVME_NIDT_EUI64;
4467         eui64.hdr.nidl = NVME_NIDL_EUI64;
4468         eui64.v = cpu_to_be64(ns->params.eui64);
4469         memcpy(pos, &eui64, sizeof(eui64));
4470         pos += sizeof(eui64);
4471     }
4472 
4473     csi.hdr.nidt = NVME_NIDT_CSI;
4474     csi.hdr.nidl = NVME_NIDL_CSI;
4475     csi.v = ns->csi;
4476     memcpy(pos, &csi, sizeof(csi));
4477     pos += sizeof(csi);
4478 
4479     return nvme_c2h(n, list, sizeof(list), req);
4480 }
4481 
4482 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4483 {
4484     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4485     static const int data_len = sizeof(list);
4486 
4487     trace_pci_nvme_identify_cmd_set();
4488 
4489     NVME_SET_CSI(*list, NVME_CSI_NVM);
4490     NVME_SET_CSI(*list, NVME_CSI_ZONED);
4491 
4492     return nvme_c2h(n, list, data_len, req);
4493 }
4494 
4495 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4496 {
4497     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4498 
4499     trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4500                             c->csi);
4501 
4502     switch (c->cns) {
4503     case NVME_ID_CNS_NS:
4504         return nvme_identify_ns(n, req, true);
4505     case NVME_ID_CNS_NS_PRESENT:
4506         return nvme_identify_ns(n, req, false);
4507     case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4508         return nvme_identify_ns_attached_list(n, req);
4509     case NVME_ID_CNS_CS_NS:
4510         return nvme_identify_ns_csi(n, req, true);
4511     case NVME_ID_CNS_CS_NS_PRESENT:
4512         return nvme_identify_ns_csi(n, req, false);
4513     case NVME_ID_CNS_CTRL:
4514         return nvme_identify_ctrl(n, req);
4515     case NVME_ID_CNS_CS_CTRL:
4516         return nvme_identify_ctrl_csi(n, req);
4517     case NVME_ID_CNS_NS_ACTIVE_LIST:
4518         return nvme_identify_nslist(n, req, true);
4519     case NVME_ID_CNS_NS_PRESENT_LIST:
4520         return nvme_identify_nslist(n, req, false);
4521     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4522         return nvme_identify_nslist_csi(n, req, true);
4523     case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4524         return nvme_identify_nslist_csi(n, req, false);
4525     case NVME_ID_CNS_NS_DESCR_LIST:
4526         return nvme_identify_ns_descr_list(n, req);
4527     case NVME_ID_CNS_IO_COMMAND_SET:
4528         return nvme_identify_cmd_set(n, req);
4529     default:
4530         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4531         return NVME_INVALID_FIELD | NVME_DNR;
4532     }
4533 }
4534 
4535 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4536 {
4537     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4538 
4539     req->cqe.result = 1;
4540     if (nvme_check_sqid(n, sqid)) {
4541         return NVME_INVALID_FIELD | NVME_DNR;
4542     }
4543 
4544     return NVME_SUCCESS;
4545 }
4546 
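/*
 * Timestamp feature: on Set Features the host-provided millisecond timestamp
 * is stored together with the current QEMU virtual clock value; Get Features
 * returns the stored timestamp plus the virtual time elapsed since then, in
 * the 48-bit timestamp field of the feature data structure.
 */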
4547 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4548 {
4549     trace_pci_nvme_setfeat_timestamp(ts);
4550 
4551     n->host_timestamp = le64_to_cpu(ts);
4552     n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4553 }
4554 
4555 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4556 {
4557     uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4558     uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4559 
4560     union nvme_timestamp {
4561         struct {
4562             uint64_t timestamp:48;
4563             uint64_t sync:1;
4564             uint64_t origin:3;
4565             uint64_t rsvd1:12;
4566         };
4567         uint64_t all;
4568     };
4569 
4570     union nvme_timestamp ts;
4571     ts.all = 0;
4572     ts.timestamp = n->host_timestamp + elapsed_time;
4573 
4574     /* If the host timestamp is non-zero, set the timestamp origin */
4575     ts.origin = n->host_timestamp ? 0x01 : 0x00;
4576 
4577     trace_pci_nvme_getfeat_timestamp(ts.all);
4578 
4579     return cpu_to_le64(ts.all);
4580 }
4581 
4582 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4583 {
4584     uint64_t timestamp = nvme_get_timestamp(n);
4585 
4586     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4587 }
4588 
4589 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4590 {
4591     NvmeCmd *cmd = &req->cmd;
4592     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4593     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4594     uint32_t nsid = le32_to_cpu(cmd->nsid);
4595     uint32_t result;
4596     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4597     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4598     uint16_t iv;
4599     NvmeNamespace *ns;
4600     int i;
4601 
4602     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4603         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4604     };
4605 
4606     trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4607 
4608     if (!nvme_feature_support[fid]) {
4609         return NVME_INVALID_FIELD | NVME_DNR;
4610     }
4611 
4612     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4613         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4614             /*
4615              * The Reservation Notification Mask and Reservation Persistence
4616              * features require a status code of Invalid Field in Command when
4617              * NSID is FFFFFFFFh. Since the device does not support those
4618              * features we can always return Invalid Namespace or Format as we
4619              * should do for all other features.
4620              */
4621             return NVME_INVALID_NSID | NVME_DNR;
4622         }
4623 
4624         if (!nvme_ns(n, nsid)) {
4625             return NVME_INVALID_FIELD | NVME_DNR;
4626         }
4627     }
4628 
4629     switch (sel) {
4630     case NVME_GETFEAT_SELECT_CURRENT:
4631         break;
4632     case NVME_GETFEAT_SELECT_SAVED:
4633         /* no features are saveable by the controller; fallthrough */
4634     case NVME_GETFEAT_SELECT_DEFAULT:
4635         goto defaults;
4636     case NVME_GETFEAT_SELECT_CAP:
4637         result = nvme_feature_cap[fid];
4638         goto out;
4639     }
4640 
4641     switch (fid) {
4642     case NVME_TEMPERATURE_THRESHOLD:
4643         result = 0;
4644 
4645         /*
4646          * The controller only implements the Composite Temperature sensor, so
4647          * return 0 for all other sensors.
4648          */
4649         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4650             goto out;
4651         }
4652 
4653         switch (NVME_TEMP_THSEL(dw11)) {
4654         case NVME_TEMP_THSEL_OVER:
4655             result = n->features.temp_thresh_hi;
4656             goto out;
4657         case NVME_TEMP_THSEL_UNDER:
4658             result = n->features.temp_thresh_low;
4659             goto out;
4660         }
4661 
4662         return NVME_INVALID_FIELD | NVME_DNR;
4663     case NVME_ERROR_RECOVERY:
4664         if (!nvme_nsid_valid(n, nsid)) {
4665             return NVME_INVALID_NSID | NVME_DNR;
4666         }
4667 
4668         ns = nvme_ns(n, nsid);
4669         if (unlikely(!ns)) {
4670             return NVME_INVALID_FIELD | NVME_DNR;
4671         }
4672 
4673         result = ns->features.err_rec;
4674         goto out;
4675     case NVME_VOLATILE_WRITE_CACHE:
4676         result = 0;
4677         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4678             ns = nvme_ns(n, i);
4679             if (!ns) {
4680                 continue;
4681             }
4682 
4683             result = blk_enable_write_cache(ns->blkconf.blk);
4684             if (result) {
4685                 break;
4686             }
4687         }
4688         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4689         goto out;
4690     case NVME_ASYNCHRONOUS_EVENT_CONF:
4691         result = n->features.async_config;
4692         goto out;
4693     case NVME_TIMESTAMP:
4694         return nvme_get_feature_timestamp(n, req);
4695     default:
4696         break;
4697     }
4698 
4699 defaults:
4700     switch (fid) {
4701     case NVME_TEMPERATURE_THRESHOLD:
4702         result = 0;
4703 
4704         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4705             break;
4706         }
4707 
4708         if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4709             result = NVME_TEMPERATURE_WARNING;
4710         }
4711 
4712         break;
4713     case NVME_NUMBER_OF_QUEUES:
4714         result = (n->params.max_ioqpairs - 1) |
4715             ((n->params.max_ioqpairs - 1) << 16);
4716         trace_pci_nvme_getfeat_numq(result);
4717         break;
4718     case NVME_INTERRUPT_VECTOR_CONF:
4719         iv = dw11 & 0xffff;
4720         if (iv >= n->params.max_ioqpairs + 1) {
4721             return NVME_INVALID_FIELD | NVME_DNR;
4722         }
4723 
4724         result = iv;
4725         if (iv == n->admin_cq.vector) {
4726             result |= NVME_INTVC_NOCOALESCING;
4727         }
4728         break;
4729     default:
4730         result = nvme_feature_default[fid];
4731         break;
4732     }
4733 
4734 out:
4735     req->cqe.result = cpu_to_le32(result);
4736     return NVME_SUCCESS;
4737 }
4738 
4739 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4740 {
4741     uint16_t ret;
4742     uint64_t timestamp;
4743 
4744     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4745     if (ret) {
4746         return ret;
4747     }
4748 
4749     nvme_set_timestamp(n, timestamp);
4750 
4751     return NVME_SUCCESS;
4752 }
4753 
4754 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4755 {
4756     NvmeNamespace *ns = NULL;
4757 
4758     NvmeCmd *cmd = &req->cmd;
4759     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4760     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4761     uint32_t nsid = le32_to_cpu(cmd->nsid);
4762     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4763     uint8_t save = NVME_SETFEAT_SAVE(dw10);
4764     int i;
4765 
4766     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
4767 
4768     if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
4769         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
4770     }
4771 
4772     if (!nvme_feature_support[fid]) {
4773         return NVME_INVALID_FIELD | NVME_DNR;
4774     }
4775 
4776     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4777         if (nsid != NVME_NSID_BROADCAST) {
4778             if (!nvme_nsid_valid(n, nsid)) {
4779                 return NVME_INVALID_NSID | NVME_DNR;
4780             }
4781 
4782             ns = nvme_ns(n, nsid);
4783             if (unlikely(!ns)) {
4784                 return NVME_INVALID_FIELD | NVME_DNR;
4785             }
4786         }
4787     } else if (nsid && nsid != NVME_NSID_BROADCAST) {
4788         if (!nvme_nsid_valid(n, nsid)) {
4789             return NVME_INVALID_NSID | NVME_DNR;
4790         }
4791 
4792         return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
4793     }
4794 
4795     if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
4796         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4797     }
4798 
4799     switch (fid) {
4800     case NVME_TEMPERATURE_THRESHOLD:
4801         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4802             break;
4803         }
4804 
4805         switch (NVME_TEMP_THSEL(dw11)) {
4806         case NVME_TEMP_THSEL_OVER:
4807             n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
4808             break;
4809         case NVME_TEMP_THSEL_UNDER:
4810             n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
4811             break;
4812         default:
4813             return NVME_INVALID_FIELD | NVME_DNR;
4814         }
4815 
4816         if ((n->temperature >= n->features.temp_thresh_hi) ||
4817             (n->temperature <= n->features.temp_thresh_low)) {
4818             nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
4819         }
4820 
4821         break;
4822     case NVME_ERROR_RECOVERY:
4823         if (nsid == NVME_NSID_BROADCAST) {
4824             for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4825                 ns = nvme_ns(n, i);
4826 
4827                 if (!ns) {
4828                     continue;
4829                 }
4830 
4831                 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
4832                     ns->features.err_rec = dw11;
4833                 }
4834             }
4835 
4836             break;
4837         }
4838 
4839         assert(ns);
4840         if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
4841             ns->features.err_rec = dw11;
4842         }
4843         break;
4844     case NVME_VOLATILE_WRITE_CACHE:
4845         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4846             ns = nvme_ns(n, i);
4847             if (!ns) {
4848                 continue;
4849             }
4850 
4851             if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
4852                 blk_flush(ns->blkconf.blk);
4853             }
4854 
4855             blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
4856         }
4857 
4858         break;
4859 
4860     case NVME_NUMBER_OF_QUEUES:
4861         if (n->qs_created) {
4862             return NVME_CMD_SEQ_ERROR | NVME_DNR;
4863         }
4864 
4865         /*
4866          * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
4867          * and NSQR.
4868          */
4869         if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
4870             return NVME_INVALID_FIELD | NVME_DNR;
4871         }
4872 
4873         trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
4874                                     ((dw11 >> 16) & 0xffff) + 1,
4875                                     n->params.max_ioqpairs,
4876                                     n->params.max_ioqpairs);
4877         req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
4878                                       ((n->params.max_ioqpairs - 1) << 16));
4879         break;
4880     case NVME_ASYNCHRONOUS_EVENT_CONF:
4881         n->features.async_config = dw11;
4882         break;
4883     case NVME_TIMESTAMP:
4884         return nvme_set_feature_timestamp(n, req);
4885     case NVME_COMMAND_SET_PROFILE:
4886         if (dw11 & 0x1ff) {
4887             trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
4888             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
4889         }
4890         break;
4891     default:
4892         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4893     }
4894     return NVME_SUCCESS;
4895 }
4896 
4897 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
4898 {
4899     trace_pci_nvme_aer(nvme_cid(req));
4900 
4901     if (n->outstanding_aers > n->params.aerl) {
4902         trace_pci_nvme_aer_aerl_exceeded();
4903         return NVME_AER_LIMIT_EXCEEDED;
4904     }
4905 
4906     n->aer_reqs[n->outstanding_aers] = req;
4907     n->outstanding_aers++;
4908 
4909     if (!QTAILQ_EMPTY(&n->aer_queue)) {
4910         nvme_process_aers(n);
4911     }
4912 
4913     return NVME_NO_COMPLETE;
4914 }
4915 
4916 static void nvme_update_dmrsl(NvmeCtrl *n)
4917 {
4918     int nsid;
4919 
4920     for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
4921         NvmeNamespace *ns = nvme_ns(n, nsid);
4922         if (!ns) {
4923             continue;
4924         }
4925 
4926         n->dmrsl = MIN_NON_ZERO(n->dmrsl,
4927                                 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
4928     }
4929 }
4930 
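/*
 * Select the set of supported I/O commands for a namespace based on the I/O
 * Command Set selected in CC.CSS: the Zoned command set is only available
 * when "All Supported I/O Command Sets" (CSI) is selected; with the NVM
 * command set selected, a zoned namespace is limited to the NVM commands.
 */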
4931 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
4932 {
4933     ns->iocs = nvme_cse_iocs_none;
4934     switch (ns->csi) {
4935     case NVME_CSI_NVM:
4936         if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
4937             ns->iocs = nvme_cse_iocs_nvm;
4938         }
4939         break;
4940     case NVME_CSI_ZONED:
4941         if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
4942             ns->iocs = nvme_cse_iocs_zoned;
4943         } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
4944             ns->iocs = nvme_cse_iocs_nvm;
4945         }
4946         break;
4947     }
4948 }
4949 
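/*
 * Namespace Attachment: bits 3:0 of CDW10 select attach (0h) or detach, and
 * the data buffer holds a controller list. The namespace is attached to or
 * detached from every controller in the list, and the namespace ID is added
 * to each controller's changed namespace list (queueing a Namespace Attribute
 * Changed notice if it was not already listed).
 */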
4950 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
4951 {
4952     NvmeNamespace *ns;
4953     NvmeCtrl *ctrl;
4954     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4955     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4956     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
4957     bool attach = !(dw10 & 0xf);
4958     uint16_t *nr_ids = &list[0];
4959     uint16_t *ids = &list[1];
4960     uint16_t ret;
4961     int i;
4962 
4963     trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
4964 
4965     if (!nvme_nsid_valid(n, nsid)) {
4966         return NVME_INVALID_NSID | NVME_DNR;
4967     }
4968 
4969     ns = nvme_subsys_ns(n->subsys, nsid);
4970     if (!ns) {
4971         return NVME_INVALID_FIELD | NVME_DNR;
4972     }
4973 
4974     ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
4975     if (ret) {
4976         return ret;
4977     }
4978 
4979     if (!*nr_ids) {
4980         return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4981     }
4982 
4983     *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
4984     for (i = 0; i < *nr_ids; i++) {
4985         ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
4986         if (!ctrl) {
4987             return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4988         }
4989 
4990         if (attach) {
4991             if (nvme_ns(ctrl, nsid)) {
4992                 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
4993             }
4994 
4995             if (ns->attached && !ns->params.shared) {
4996                 return NVME_NS_PRIVATE | NVME_DNR;
4997             }
4998 
4999             nvme_attach_ns(ctrl, ns);
5000             nvme_select_iocs_ns(ctrl, ns);
5001         } else {
5002             if (!nvme_ns(ctrl, nsid)) {
5003                 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5004             }
5005 
5006             ctrl->namespaces[nsid] = NULL;
5007             ns->attached--;
5008 
5009             nvme_update_dmrsl(ctrl);
5010         }
5011 
5012         /*
5013          * Add the namespace ID to the changed namespace list so the host can
5014          * retrieve it (and clear the event) via the Get Log Page command.
5015          */
5016         if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5017             nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5018                                NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5019                                NVME_LOG_CHANGED_NSLIST);
5020         }
5021     }
5022 
5023     return NVME_SUCCESS;
5024 }
5025 
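/*
 * Format a single namespace: the new LBA format and protection information
 * settings are applied and the entire backing device is zeroed with chunked
 * asynchronous write-zeroes requests. The "count"/"num_formats" reference
 * counts (1-initialized, see nvme_format()) ensure the command only completes
 * once every outstanding AIO has finished.
 */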
5026 static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
5027                                uint8_t mset, uint8_t pi, uint8_t pil,
5028                                NvmeRequest *req)
5029 {
5030     int64_t len, offset;
5031     struct nvme_aio_format_ctx *ctx;
5032     BlockBackend *blk = ns->blkconf.blk;
5033     uint16_t ms;
5034     uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5035     int *count;
5036 
5037     if (ns->params.zoned) {
5038         return NVME_INVALID_FORMAT | NVME_DNR;
5039     }
5040 
5041     trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil);
5042 
5043     if (lbaf > ns->id_ns.nlbaf) {
5044         return NVME_INVALID_FORMAT | NVME_DNR;
5045     }
5046 
5047     ms = ns->id_ns.lbaf[lbaf].ms;
5048 
5049     if (pi && (ms < sizeof(NvmeDifTuple))) {
5050         return NVME_INVALID_FORMAT | NVME_DNR;
5051     }
5052 
5053     if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5054         return NVME_INVALID_FIELD | NVME_DNR;
5055     }
5056 
5057     nvme_ns_drain(ns);
5058     nvme_ns_shutdown(ns);
5059     nvme_ns_cleanup(ns);
5060 
5061     ns->id_ns.dps = (pil << 3) | pi;
5062     ns->id_ns.flbas = lbaf | (mset << 4);
5063 
5064     nvme_ns_init_format(ns);
5065 
5066     ns->status = NVME_FORMAT_IN_PROGRESS;
5067 
5068     len = ns->size;
5069     offset = 0;
5070 
5071     count = g_new(int, 1);
5072     *count = 1;
5073 
5074     (*num_formats)++;
5075 
5076     while (len) {
5077         ctx = g_new(struct nvme_aio_format_ctx, 1);
5078         ctx->req = req;
5079         ctx->ns = ns;
5080         ctx->count = count;
5081 
5082         size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
5083 
5084         (*count)++;
5085 
5086         blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP,
5087                               nvme_aio_format_cb, ctx);
5088 
5089         offset += bytes;
5090         len -= bytes;
5091 
5092     }
5093 
5094     if (--(*count)) {
5095         return NVME_NO_COMPLETE;
5096     }
5097 
5098     g_free(count);
5099     ns->status = 0x0;
5100     (*num_formats)--;
5101 
5102     return NVME_SUCCESS;
5103 }
5104 
5105 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5106 {
5107     NvmeNamespace *ns;
5108     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5109     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5110     uint8_t lbaf = dw10 & 0xf;
5111     uint8_t mset = (dw10 >> 4) & 0x1;
5112     uint8_t pi = (dw10 >> 5) & 0x7;
5113     uint8_t pil = (dw10 >> 8) & 0x1;
5114     uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5115     uint16_t status;
5116     int i;
5117 
5118     trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil);
5119 
5120     /* 1-initialize; see the comment in nvme_dsm */
5121     *num_formats = 1;
5122 
5123     if (nsid != NVME_NSID_BROADCAST) {
5124         if (!nvme_nsid_valid(n, nsid)) {
5125             return NVME_INVALID_NSID | NVME_DNR;
5126         }
5127 
5128         ns = nvme_ns(n, nsid);
5129         if (!ns) {
5130             return NVME_INVALID_FIELD | NVME_DNR;
5131         }
5132 
5133         status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5134         if (status && status != NVME_NO_COMPLETE) {
5135             req->status = status;
5136         }
5137     } else {
5138         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5139             ns = nvme_ns(n, i);
5140             if (!ns) {
5141                 continue;
5142             }
5143 
5144             status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5145             if (status && status != NVME_NO_COMPLETE) {
5146                 req->status = status;
5147                 break;
5148             }
5149         }
5150     }
5151 
5152     /* account for the 1-initialization */
5153     if (--(*num_formats)) {
5154         return NVME_NO_COMPLETE;
5155     }
5156 
5157     return req->status;
5158 }
5159 
5160 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5161 {
5162     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5163                              nvme_adm_opc_str(req->cmd.opcode));
5164 
5165     if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5166         trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5167         return NVME_INVALID_OPCODE | NVME_DNR;
5168     }
5169 
5170     /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5171     if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5172         return NVME_INVALID_FIELD | NVME_DNR;
5173     }
5174 
5175     switch (req->cmd.opcode) {
5176     case NVME_ADM_CMD_DELETE_SQ:
5177         return nvme_del_sq(n, req);
5178     case NVME_ADM_CMD_CREATE_SQ:
5179         return nvme_create_sq(n, req);
5180     case NVME_ADM_CMD_GET_LOG_PAGE:
5181         return nvme_get_log(n, req);
5182     case NVME_ADM_CMD_DELETE_CQ:
5183         return nvme_del_cq(n, req);
5184     case NVME_ADM_CMD_CREATE_CQ:
5185         return nvme_create_cq(n, req);
5186     case NVME_ADM_CMD_IDENTIFY:
5187         return nvme_identify(n, req);
5188     case NVME_ADM_CMD_ABORT:
5189         return nvme_abort(n, req);
5190     case NVME_ADM_CMD_SET_FEATURES:
5191         return nvme_set_feature(n, req);
5192     case NVME_ADM_CMD_GET_FEATURES:
5193         return nvme_get_feature(n, req);
5194     case NVME_ADM_CMD_ASYNC_EV_REQ:
5195         return nvme_aer(n, req);
5196     case NVME_ADM_CMD_NS_ATTACHMENT:
5197         return nvme_ns_attachment(n, req);
5198     case NVME_ADM_CMD_FORMAT_NVM:
5199         return nvme_format(n, req);
5200     default:
5201         assert(false);
5202     }
5203 
5204     return NVME_INVALID_OPCODE | NVME_DNR;
5205 }
5206 
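/*
 * Submission queue processing, run from the queue's timer when a doorbell
 * is rung. Each SQE is fetched from queue memory, dispatched to the admin
 * or I/O command handler depending on the queue id, and completed unless
 * the handler returned NVME_NO_COMPLETE (i.e. the request will complete
 * asynchronously). A failed SQE read marks the controller as failed via
 * CSTS and stops processing.
 */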
5207 static void nvme_process_sq(void *opaque)
5208 {
5209     NvmeSQueue *sq = opaque;
5210     NvmeCtrl *n = sq->ctrl;
5211     NvmeCQueue *cq = n->cq[sq->cqid];
5212 
5213     uint16_t status;
5214     hwaddr addr;
5215     NvmeCmd cmd;
5216     NvmeRequest *req;
5217 
5218     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5219         addr = sq->dma_addr + sq->head * n->sqe_size;
5220         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5221             trace_pci_nvme_err_addr_read(addr);
5222             trace_pci_nvme_err_cfs();
5223             n->bar.csts = NVME_CSTS_FAILED;
5224             break;
5225         }
5226         nvme_inc_sq_head(sq);
5227 
5228         req = QTAILQ_FIRST(&sq->req_list);
5229         QTAILQ_REMOVE(&sq->req_list, req, entry);
5230         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5231         nvme_req_clear(req);
5232         req->cqe.cid = cmd.cid;
5233         memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5234 
5235         status = sq->sqid ? nvme_io_cmd(n, req) :
5236             nvme_admin_cmd(n, req);
5237         if (status != NVME_NO_COMPLETE) {
5238             req->status = status;
5239             nvme_enqueue_req_completion(cq, req);
5240         }
5241     }
5242 }
5243 
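/*
 * Controller reset: drain all namespaces, tear down every submission and
 * completion queue (including the admin queues), drop any queued
 * asynchronous events and clear CC.
 */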
5244 static void nvme_ctrl_reset(NvmeCtrl *n)
5245 {
5246     NvmeNamespace *ns;
5247     int i;
5248 
5249     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5250         ns = nvme_ns(n, i);
5251         if (!ns) {
5252             continue;
5253         }
5254 
5255         nvme_ns_drain(ns);
5256     }
5257 
5258     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5259         if (n->sq[i] != NULL) {
5260             nvme_free_sq(n->sq[i], n);
5261         }
5262     }
5263     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5264         if (n->cq[i] != NULL) {
5265             nvme_free_cq(n->cq[i], n);
5266         }
5267     }
5268 
5269     while (!QTAILQ_EMPTY(&n->aer_queue)) {
5270         NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5271         QTAILQ_REMOVE(&n->aer_queue, event, entry);
5272         g_free(event);
5273     }
5274 
5275     n->aer_queued = 0;
5276     n->outstanding_aers = 0;
5277     n->qs_created = false;
5278 
5279     n->bar.cc = 0;
5280 }
5281 
5282 static void nvme_ctrl_shutdown(NvmeCtrl *n)
5283 {
5284     NvmeNamespace *ns;
5285     int i;
5286 
5287     if (n->pmr.dev) {
5288         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5289     }
5290 
5291     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5292         ns = nvme_ns(n, i);
5293         if (!ns) {
5294             continue;
5295         }
5296 
5297         nvme_ns_shutdown(ns);
5298     }
5299 }
5300 
5301 static void nvme_select_iocs(NvmeCtrl *n)
5302 {
5303     NvmeNamespace *ns;
5304     int i;
5305 
5306     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5307         ns = nvme_ns(n, i);
5308         if (!ns) {
5309             continue;
5310         }
5311 
5312         nvme_select_iocs_ns(n, ns);
5313     }
5314 }
5315 
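/*
 * Bring the controller out of reset (CC.EN 0 -> 1). The checks below
 * validate the admin queue addresses and the CC fields (MPS, CSS, IOCQES,
 * IOSQES, AQA) against the limits advertised in CAP and the Identify
 * Controller data structure; any violation aborts the start and the caller
 * reports the failure through CSTS. On success the admin queues are set
 * up, the timestamp is reset and the per-namespace command sets are
 * selected.
 */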
5316 static int nvme_start_ctrl(NvmeCtrl *n)
5317 {
5318     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
5319     uint32_t page_size = 1 << page_bits;
5320 
5321     if (unlikely(n->cq[0])) {
5322         trace_pci_nvme_err_startfail_cq();
5323         return -1;
5324     }
5325     if (unlikely(n->sq[0])) {
5326         trace_pci_nvme_err_startfail_sq();
5327         return -1;
5328     }
5329     if (unlikely(!n->bar.asq)) {
5330         trace_pci_nvme_err_startfail_nbarasq();
5331         return -1;
5332     }
5333     if (unlikely(!n->bar.acq)) {
5334         trace_pci_nvme_err_startfail_nbaracq();
5335         return -1;
5336     }
5337     if (unlikely(n->bar.asq & (page_size - 1))) {
5338         trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
5339         return -1;
5340     }
5341     if (unlikely(n->bar.acq & (page_size - 1))) {
5342         trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
5343         return -1;
5344     }
5345     if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) {
5346         trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc));
5347         return -1;
5348     }
5349     if (unlikely(NVME_CC_MPS(n->bar.cc) <
5350                  NVME_CAP_MPSMIN(n->bar.cap))) {
5351         trace_pci_nvme_err_startfail_page_too_small(
5352                     NVME_CC_MPS(n->bar.cc),
5353                     NVME_CAP_MPSMIN(n->bar.cap));
5354         return -1;
5355     }
5356     if (unlikely(NVME_CC_MPS(n->bar.cc) >
5357                  NVME_CAP_MPSMAX(n->bar.cap))) {
5358         trace_pci_nvme_err_startfail_page_too_large(
5359                     NVME_CC_MPS(n->bar.cc),
5360                     NVME_CAP_MPSMAX(n->bar.cap));
5361         return -1;
5362     }
5363     if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
5364                  NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5365         trace_pci_nvme_err_startfail_cqent_too_small(
5366                     NVME_CC_IOCQES(n->bar.cc),
5367                     NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5368         return -1;
5369     }
5370     if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
5371                  NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5372         trace_pci_nvme_err_startfail_cqent_too_large(
5373                     NVME_CC_IOCQES(n->bar.cc),
5374                     NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5375         return -1;
5376     }
5377     if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
5378                  NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5379         trace_pci_nvme_err_startfail_sqent_too_small(
5380                     NVME_CC_IOSQES(n->bar.cc),
5381                     NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5382         return -1;
5383     }
5384     if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
5385                  NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5386         trace_pci_nvme_err_startfail_sqent_too_large(
5387                     NVME_CC_IOSQES(n->bar.cc),
5388                     NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5389         return -1;
5390     }
5391     if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
5392         trace_pci_nvme_err_startfail_asqent_sz_zero();
5393         return -1;
5394     }
5395     if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
5396         trace_pci_nvme_err_startfail_acqent_sz_zero();
5397         return -1;
5398     }
5399 
5400     n->page_bits = page_bits;
5401     n->page_size = page_size;
5402     n->max_prp_ents = n->page_size / sizeof(uint64_t);
5403     n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
5404     n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
5405     nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
5406                  NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
5407     nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
5408                  NVME_AQA_ASQS(n->bar.aqa) + 1);
5409 
5410     nvme_set_timestamp(n, 0ULL);
5411 
5412     QTAILQ_INIT(&n->aer_queue);
5413 
5414     nvme_select_iocs(n);
5415 
5416     return 0;
5417 }
5418 
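/*
 * Program the CMBLOC/CMBSZ registers to advertise the Controller Memory
 * Buffer: submission queues, PRP/SGL lists and data are supported
 * (SQS/LISTS/RDS/WDS), completion queues are not (CQS is cleared), and the
 * size is reported in MiB units (SZU = 2).
 */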
5419 static void nvme_cmb_enable_regs(NvmeCtrl *n)
5420 {
5421     NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
5422     NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
5423     NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
5424 
5425     NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
5426     NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
5427     NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
5428     NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
5429     NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
5430     NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
5431     NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
5432 }
5433 
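/*
 * MMIO writes to the controller register space (below the doorbells). The
 * most interesting register is CC: setting CC.EN starts the controller,
 * clearing it triggers a controller reset, and CC.SHN requests a shutdown
 * that is acknowledged through CSTS.SHST.
 */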
5434 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5435                            unsigned size)
5436 {
5437     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5438         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5439                        "MMIO write not 32-bit aligned,"
5440                        " offset=0x%"PRIx64"", offset);
5441         /* should be ignored, fall through for now */
5442     }
5443 
5444     if (unlikely(size < sizeof(uint32_t))) {
5445         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5446                        "MMIO write smaller than 32-bits,"
5447                        " offset=0x%"PRIx64", size=%u",
5448                        offset, size);
5449         /* should be ignored, fall through for now */
5450     }
5451 
5452     switch (offset) {
5453     case 0xc:   /* INTMS */
5454         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5455             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5456                            "undefined access to interrupt mask set"
5457                            " when MSI-X is enabled");
5458             /* should be ignored, fall through for now */
5459         }
5460         n->bar.intms |= data & 0xffffffff;
5461         n->bar.intmc = n->bar.intms;
5462         trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
5463         nvme_irq_check(n);
5464         break;
5465     case 0x10:  /* INTMC */
5466         if (unlikely(msix_enabled(&(n->parent_obj)))) {
5467             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5468                            "undefined access to interrupt mask clr"
5469                            " when MSI-X is enabled");
5470             /* should be ignored, fall through for now */
5471         }
5472         n->bar.intms &= ~(data & 0xffffffff);
5473         n->bar.intmc = n->bar.intms;
5474         trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
5475         nvme_irq_check(n);
5476         break;
5477     case 0x14:  /* CC */
5478         trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5479         /* Windows first sends data, then sends enable bit */
5480         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
5481             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
5482         {
5483             n->bar.cc = data;
5484         }
5485 
5486         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
5487             n->bar.cc = data;
5488             if (unlikely(nvme_start_ctrl(n))) {
5489                 trace_pci_nvme_err_startfail();
5490                 n->bar.csts = NVME_CSTS_FAILED;
5491             } else {
5492                 trace_pci_nvme_mmio_start_success();
5493                 n->bar.csts = NVME_CSTS_READY;
5494             }
5495         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
5496             trace_pci_nvme_mmio_stopped();
5497             nvme_ctrl_reset(n);
5498             n->bar.csts &= ~NVME_CSTS_READY;
5499         }
5500         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
5501             trace_pci_nvme_mmio_shutdown_set();
5502             nvme_ctrl_shutdown(n);
5503             n->bar.cc = data;
5504             n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
5505         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
5506             trace_pci_nvme_mmio_shutdown_cleared();
5507             n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
5508             n->bar.cc = data;
5509         }
5510         break;
5511     case 0x1c:  /* CSTS */
5512         if (data & (1 << 4)) {
5513             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5514                            "attempted to W1C CSTS.NSSRO"
5515                            " but CAP.NSSRS is zero (not supported)");
5516         } else if (data != 0) {
5517             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5518                            "attempted to set a read only bit"
5519                            " of controller status");
5520         }
5521         break;
5522     case 0x20:  /* NSSR */
5523         if (data == 0x4e564d65) {
5524             trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5525         } else {
5526             /* The spec says that writes of other values have no effect */
5527             return;
5528         }
5529         break;
5530     case 0x24:  /* AQA */
5531         n->bar.aqa = data & 0xffffffff;
5532         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5533         break;
5534     case 0x28:  /* ASQ */
5535         n->bar.asq = size == 8 ? data :
5536             (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
5537         trace_pci_nvme_mmio_asqaddr(data);
5538         break;
5539     case 0x2c:  /* ASQ hi */
5540         n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
5541         trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
5542         break;
5543     case 0x30:  /* ACQ */
5544         trace_pci_nvme_mmio_acqaddr(data);
5545         n->bar.acq = size == 8 ? data :
5546             (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
5547         break;
5548     case 0x34:  /* ACQ hi */
5549         n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
5550         trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
5551         break;
5552     case 0x38:  /* CMBLOC */
5553         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5554                        "invalid write to reserved CMBLOC"
5555                        " when CMBSZ is zero, ignored");
5556         return;
5557     case 0x3C:  /* CMBSZ */
5558         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5559                        "invalid write to read only CMBSZ, ignored");
5560         return;
5561     case 0x50:  /* CMBMSC */
5562         if (!NVME_CAP_CMBS(n->bar.cap)) {
5563             return;
5564         }
5565 
5566         n->bar.cmbmsc = size == 8 ? data :
5567             (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff);
5568         n->cmb.cmse = false;
5569 
5570         if (NVME_CMBMSC_CRE(data)) {
5571             nvme_cmb_enable_regs(n);
5572 
5573             if (NVME_CMBMSC_CMSE(data)) {
5574                 hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
5575                 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5576                     NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
5577                     return;
5578                 }
5579 
5580                 n->cmb.cba = cba;
5581                 n->cmb.cmse = true;
5582             }
5583         } else {
5584             n->bar.cmbsz = 0;
5585             n->bar.cmbloc = 0;
5586         }
5587 
5588         return;
5589     case 0x54:  /* CMBMSC hi */
5590         n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
5591         return;
5592 
5593     case 0xe00: /* PMRCAP */
5594         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5595                        "invalid write to PMRCAP register, ignored");
5596         return;
5597     case 0xe04: /* PMRCTL */
5598         n->bar.pmrctl = data;
5599         if (NVME_PMRCTL_EN(data)) {
5600             memory_region_set_enabled(&n->pmr.dev->mr, true);
5601             n->bar.pmrsts = 0;
5602         } else {
5603             memory_region_set_enabled(&n->pmr.dev->mr, false);
5604             NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
5605             n->pmr.cmse = false;
5606         }
5607         return;
5608     case 0xe08: /* PMRSTS */
5609         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5610                        "invalid write to PMRSTS register, ignored");
5611         return;
5612     case 0xe0C: /* PMREBS */
5613         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5614                        "invalid write to PMREBS register, ignored");
5615         return;
5616     case 0xe10: /* PMRSWTP */
5617         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5618                        "invalid write to PMRSWTP register, ignored");
5619         return;
5620     case 0xe14: /* PMRMSCL */
5621         if (!NVME_CAP_PMRS(n->bar.cap)) {
5622             return;
5623         }
5624 
5625         n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff);
5626         n->pmr.cmse = false;
5627 
5628         if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
5629             hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
5630             if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5631                 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
5632                 return;
5633             }
5634 
5635             n->pmr.cmse = true;
5636             n->pmr.cba = cba;
5637         }
5638 
5639         return;
5640     case 0xe18: /* PMRMSCU */
5641         if (!NVME_CAP_PMRS(n->bar.cap)) {
5642             return;
5643         }
5644 
5645         n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
5646         return;
5647     default:
5648         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5649                        "invalid MMIO write,"
5650                        " offset=0x%"PRIx64", data=%"PRIx64"",
5651                        offset, data);
5652         break;
5653     }
5654 }
5655 
5656 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5657 {
5658     NvmeCtrl *n = (NvmeCtrl *)opaque;
5659     uint8_t *ptr = (uint8_t *)&n->bar;
5660     uint64_t val = 0;
5661 
5662     trace_pci_nvme_mmio_read(addr, size);
5663 
5664     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5665         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5666                        "MMIO read not 32-bit aligned,"
5667                        " offset=0x%"PRIx64"", addr);
5668         /* should RAZ, fall through for now */
5669     } else if (unlikely(size < sizeof(uint32_t))) {
5670         NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
5671                        "MMIO read smaller than 32-bits,"
5672                        " offset=0x%"PRIx64"", addr);
5673         /* should RAZ, fall through for now */
5674     }
5675 
5676     if (addr < sizeof(n->bar)) {
5677         /*
5678          * When PMRWBM bit 1 is set, a read from
5679          * PMRSTS should ensure that prior writes
5680          * made it to persistent media
5681          */
5682         if (addr == 0xe08 &&
5683             (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
5684             memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5685         }
5686         memcpy(&val, ptr + addr, size);
5687     } else {
5688         NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
5689                        "MMIO read beyond last register,"
5690                        " offset=0x%"PRIx64", returning 0", addr);
5691     }
5692 
5693     return val;
5694 }
5695 
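/*
 * Doorbell writes. With a doorbell stride of 4 bytes (CAP.DSTRD = 0), the
 * tail doorbell of submission queue y lives at 0x1000 + (2 * y) * 4 and
 * the head doorbell of completion queue y at 0x1000 + (2 * y + 1) * 4;
 * e.g. SQ 1 is rung at offset 0x1008 and CQ 1 at 0x100c. Bit 2 of the
 * offset therefore selects CQ vs. SQ and the queue id is recovered by
 * shifting.
 */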
5696 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
5697 {
5698     uint32_t qid;
5699 
5700     if (unlikely(addr & ((1 << 2) - 1))) {
5701         NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
5702                        "doorbell write not 32-bit aligned,"
5703                        " offset=0x%"PRIx64", ignoring", addr);
5704         return;
5705     }
5706 
5707     if (((addr - 0x1000) >> 2) & 1) {
5708         /* Completion queue doorbell write */
5709 
5710         uint16_t new_head = val & 0xffff;
5711         int start_sqs;
5712         NvmeCQueue *cq;
5713 
5714         qid = (addr - (0x1000 + (1 << 2))) >> 3;
5715         if (unlikely(nvme_check_cqid(n, qid))) {
5716             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
5717                            "completion queue doorbell write"
5718                            " for nonexistent queue,"
5719                            " cqid=%"PRIu32", ignoring", qid);
5720 
5721             /*
5722              * NVM Express v1.3d, Section 4.1 states: "If host software writes
5723              * an invalid value to the Submission Queue Tail Doorbell or
5724              * Completion Queue Head Doorbell register and an Asynchronous Event
5725              * Request command is outstanding, then an asynchronous event is
5726              * posted to the Admin Completion Queue with a status code of
5727              * Invalid Doorbell Write Value."
5728              *
5729              * Also note that the spec includes the "Invalid Doorbell Register"
5730              * status code, but nowhere does it specify when to use it.
5731              * However, it seems reasonable to use it here in a similar
5732              * fashion.
5733              */
5734             if (n->outstanding_aers) {
5735                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5736                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5737                                    NVME_LOG_ERROR_INFO);
5738             }
5739 
5740             return;
5741         }
5742 
5743         cq = n->cq[qid];
5744         if (unlikely(new_head >= cq->size)) {
5745             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
5746                            "completion queue doorbell write value"
5747                            " beyond queue size, cqid=%"PRIu32","
5748                            " new_head=%"PRIu16", ignoring",
5749                            qid, new_head);
5750 
5751             if (n->outstanding_aers) {
5752                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5753                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5754                                    NVME_LOG_ERROR_INFO);
5755             }
5756 
5757             return;
5758         }
5759 
5760         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
5761 
5762         start_sqs = nvme_cq_full(cq) ? 1 : 0;
5763         cq->head = new_head;
5764         if (start_sqs) {
5765             NvmeSQueue *sq;
5766             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
5767                 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5768             }
5769             timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5770         }
5771 
5772         if (cq->tail == cq->head) {
5773             nvme_irq_deassert(n, cq);
5774         }
5775     } else {
5776         /* Submission queue doorbell write */
5777 
5778         uint16_t new_tail = val & 0xffff;
5779         NvmeSQueue *sq;
5780 
5781         qid = (addr - 0x1000) >> 3;
5782         if (unlikely(nvme_check_sqid(n, qid))) {
5783             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
5784                            "submission queue doorbell write"
5785                            " for nonexistent queue,"
5786                            " sqid=%"PRIu32", ignoring", qid);
5787 
5788             if (n->outstanding_aers) {
5789                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5790                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5791                                    NVME_LOG_ERROR_INFO);
5792             }
5793 
5794             return;
5795         }
5796 
5797         sq = n->sq[qid];
5798         if (unlikely(new_tail >= sq->size)) {
5799             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
5800                            "submission queue doorbell write value"
5801                            " beyond queue size, sqid=%"PRIu32","
5802                            " new_tail=%"PRIu16", ignoring",
5803                            qid, new_tail);
5804 
5805             if (n->outstanding_aers) {
5806                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5807                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5808                                    NVME_LOG_ERROR_INFO);
5809             }
5810 
5811             return;
5812         }
5813 
5814         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
5815 
5816         sq->tail = new_tail;
5817         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5818     }
5819 }
5820 
5821 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
5822                             unsigned size)
5823 {
5824     NvmeCtrl *n = (NvmeCtrl *)opaque;
5825 
5826     trace_pci_nvme_mmio_write(addr, data, size);
5827 
5828     if (addr < sizeof(n->bar)) {
5829         nvme_write_bar(n, addr, data, size);
5830     } else {
5831         nvme_process_db(n, addr, data);
5832     }
5833 }
5834 
5835 static const MemoryRegionOps nvme_mmio_ops = {
5836     .read = nvme_mmio_read,
5837     .write = nvme_mmio_write,
5838     .endianness = DEVICE_LITTLE_ENDIAN,
5839     .impl = {
5840         .min_access_size = 2,
5841         .max_access_size = 8,
5842     },
5843 };
5844 
5845 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
5846                            unsigned size)
5847 {
5848     NvmeCtrl *n = (NvmeCtrl *)opaque;
5849     stn_le_p(&n->cmb.buf[addr], size, data);
5850 }
5851 
5852 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
5853 {
5854     NvmeCtrl *n = (NvmeCtrl *)opaque;
5855     return ldn_le_p(&n->cmb.buf[addr], size);
5856 }
5857 
5858 static const MemoryRegionOps nvme_cmb_ops = {
5859     .read = nvme_cmb_read,
5860     .write = nvme_cmb_write,
5861     .endianness = DEVICE_LITTLE_ENDIAN,
5862     .impl = {
5863         .min_access_size = 1,
5864         .max_access_size = 8,
5865     },
5866 };
5867 
5868 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
5869 {
5870     NvmeParams *params = &n->params;
5871 
5872     if (params->num_queues) {
5873         warn_report("num_queues is deprecated; please use max_ioqpairs "
5874                     "instead");
5875 
5876         params->max_ioqpairs = params->num_queues - 1;
5877     }
5878 
5879     if (n->namespace.blkconf.blk && n->subsys) {
5880         error_setg(errp, "subsystem support is unavailable with legacy "
5881                    "namespace ('drive' property)");
5882         return;
5883     }
5884 
5885     if (params->max_ioqpairs < 1 ||
5886         params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
5887         error_setg(errp, "max_ioqpairs must be between 1 and %d",
5888                    NVME_MAX_IOQPAIRS);
5889         return;
5890     }
5891 
5892     if (params->msix_qsize < 1 ||
5893         params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
5894         error_setg(errp, "msix_qsize must be between 1 and %d",
5895                    PCI_MSIX_FLAGS_QSIZE + 1);
5896         return;
5897     }
5898 
5899     if (!params->serial) {
5900         error_setg(errp, "serial property not set");
5901         return;
5902     }
5903 
5904     if (n->pmr.dev) {
5905         if (host_memory_backend_is_mapped(n->pmr.dev)) {
5906             error_setg(errp, "can't use already busy memdev: %s",
5907                        object_get_canonical_path_component(OBJECT(n->pmr.dev)));
5908             return;
5909         }
5910 
5911         if (!is_power_of_2(n->pmr.dev->size)) {
5912             error_setg(errp, "pmr backend size needs to be a power of 2");
5913             return;
5914         }
5915 
5916         host_memory_backend_set_mapped(n->pmr.dev, true);
5917     }
5918 
5919     if (n->params.zasl > n->params.mdts) {
5920         error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
5921                    "than or equal to mdts (Maximum Data Transfer Size)");
5922         return;
5923     }
5924 
5925     if (!n->params.vsl) {
5926         error_setg(errp, "vsl must be non-zero");
5927         return;
5928     }
5929 }
5930 
5931 static void nvme_init_state(NvmeCtrl *n)
5932 {
5933     /* add one to max_ioqpairs to account for the admin queue pair */
5934     n->reg_size = pow2ceil(sizeof(NvmeBar) +
5935                            2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
5936     n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
5937     n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
5938     n->temperature = NVME_TEMPERATURE;
5939     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
5940     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5941     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
5942 }
5943 
5944 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
5945 {
5946     uint64_t cmb_size = n->params.cmb_size_mb * MiB;
5947 
5948     n->cmb.buf = g_malloc0(cmb_size);
5949     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
5950                           "nvme-cmb", cmb_size);
5951     pci_register_bar(pci_dev, NVME_CMB_BIR,
5952                      PCI_BASE_ADDRESS_SPACE_MEMORY |
5953                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
5954                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
5955 
5956     NVME_CAP_SET_CMBS(n->bar.cap, 1);
5957 
5958     if (n->params.legacy_cmb) {
5959         nvme_cmb_enable_regs(n);
5960         n->cmb.cmse = true;
5961     }
5962 }
5963 
5964 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
5965 {
5966     NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
5967     NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
5968     NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
5969     /* Turn on bit 1 support */
5970     /* Turn on PMRWBM bit 1 support: a PMRSTS read ensures prior writes are persisted */
5971     NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
5972 
5973     pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
5974                      PCI_BASE_ADDRESS_SPACE_MEMORY |
5975                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
5976                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
5977 
5978     memory_region_set_enabled(&n->pmr.dev->mr, false);
5979 }
5980 
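/*
 * BAR0 layout: the NVMe register space and doorbells (n->reg_size) come
 * first, followed by the MSI-X table and PBA, each aligned to 4 KiB; the
 * total BAR size is then rounded up to a power of two. The CMB and the
 * PMR, if configured, are exposed through their own 64-bit prefetchable
 * BARs.
 */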
5981 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
5982 {
5983     uint8_t *pci_conf = pci_dev->config;
5984     uint64_t bar_size, msix_table_size, msix_pba_size;
5985     unsigned msix_table_offset, msix_pba_offset;
5986     int ret;
5987 
5988     Error *err = NULL;
5989 
5990     pci_conf[PCI_INTERRUPT_PIN] = 1;
5991     pci_config_set_prog_interface(pci_conf, 0x2);
5992 
5993     if (n->params.use_intel_id) {
5994         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
5995         pci_config_set_device_id(pci_conf, 0x5845);
5996     } else {
5997         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
5998         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
5999     }
6000 
6001     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6002     pcie_endpoint_cap_init(pci_dev, 0x80);
6003 
6004     bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6005     msix_table_offset = bar_size;
6006     msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6007 
6008     bar_size += msix_table_size;
6009     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6010     msix_pba_offset = bar_size;
6011     msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6012 
6013     bar_size += msix_pba_size;
6014     bar_size = pow2ceil(bar_size);
6015 
6016     memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6017     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6018                           n->reg_size);
6019     memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6020 
6021     pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6022                      PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6023     ret = msix_init(pci_dev, n->params.msix_qsize,
6024                     &n->bar0, 0, msix_table_offset,
6025                     &n->bar0, 0, msix_pba_offset, 0, &err);
6026     if (ret < 0) {
6027         if (ret == -ENOTSUP) {
6028             warn_report_err(err);
6029         } else {
6030             error_propagate(errp, err);
6031             return ret;
6032         }
6033     }
6034 
6035     if (n->params.cmb_size_mb) {
6036         nvme_init_cmb(n, pci_dev);
6037     }
6038 
6039     if (n->pmr.dev) {
6040         nvme_init_pmr(n, pci_dev);
6041     }
6042 
6043     return 0;
6044 }
6045 
6046 static void nvme_init_subnqn(NvmeCtrl *n)
6047 {
6048     NvmeSubsystem *subsys = n->subsys;
6049     NvmeIdCtrl *id = &n->id_ctrl;
6050 
6051     if (!subsys) {
6052         snprintf((char *)id->subnqn, sizeof(id->subnqn),
6053                  "nqn.2019-08.org.qemu:%s", n->params.serial);
6054     } else {
6055         pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6056     }
6057 }
6058 
6059 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6060 {
6061     NvmeIdCtrl *id = &n->id_ctrl;
6062     uint8_t *pci_conf = pci_dev->config;
6063 
6064     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6065     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6066     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6067     strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6068     strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6069 
6070     id->cntlid = cpu_to_le16(n->cntlid);
6071 
6072     id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6073 
6074     id->rab = 6;
6075 
6076     if (n->params.use_intel_id) {
6077         id->ieee[0] = 0xb3;
6078         id->ieee[1] = 0x02;
6079         id->ieee[2] = 0x00;
6080     } else {
6081         id->ieee[0] = 0x00;
6082         id->ieee[1] = 0x54;
6083         id->ieee[2] = 0x52;
6084     }
6085 
6086     id->mdts = n->params.mdts;
6087     id->ver = cpu_to_le32(NVME_SPEC_VER);
6088     id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6089     id->cntrltype = 0x1;
6090 
6091     /*
6092      * Because the controller always completes the Abort command immediately,
6093      * there can never be more than one concurrently executing Abort command,
6094      * so this value is never used for anything. Note that there can easily be
6095      * many Abort commands in the queues, but they are not considered
6096      * "executing" until processed by nvme_abort.
6097      *
6098      * The specification recommends a value of 3 for Abort Command Limit (four
6099      * concurrently outstanding Abort commands), so let's use that, though it is
6100      * inconsequential.
6101      */
6102     id->acl = 3;
6103     id->aerl = n->params.aerl;
6104     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6105     id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6106 
6107     /* recommended default value (~70 C) */
6108     id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6109     id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6110 
6111     id->sqes = (0x6 << 4) | 0x6;
6112     id->cqes = (0x4 << 4) | 0x4;
6113     id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6114     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6115                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6116                            NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6117 
6118     /*
6119      * NOTE: If this device ever supports a command set that does NOT use 0x0
6120      * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6121      * should probably be removed.
6122      *
6123      * See comment in nvme_io_cmd.
6124      */
6125     id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6126 
6127     id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6128     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6129                            NVME_CTRL_SGLS_BITBUCKET);
6130 
6131     nvme_init_subnqn(n);
6132 
6133     id->psd[0].mp = cpu_to_le16(0x9c4);
6134     id->psd[0].enlat = cpu_to_le32(0x10);
6135     id->psd[0].exlat = cpu_to_le32(0x4);
6136 
6137     if (n->subsys) {
6138         id->cmic |= NVME_CMIC_MULTI_CTRL;
6139     }
6140 
6141     NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
6142     NVME_CAP_SET_CQR(n->bar.cap, 1);
6143     NVME_CAP_SET_TO(n->bar.cap, 0xf);
6144     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
6145     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
6146     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
6147     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
6148     NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
6149     NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
6150 
6151     n->bar.vs = NVME_SPEC_VER;
6152     n->bar.intmc = n->bar.intms = 0;
6153 }
6154 
6155 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6156 {
6157     int cntlid;
6158 
6159     if (!n->subsys) {
6160         return 0;
6161     }
6162 
6163     cntlid = nvme_subsys_register_ctrl(n, errp);
6164     if (cntlid < 0) {
6165         return -1;
6166     }
6167 
6168     n->cntlid = cntlid;
6169 
6170     return 0;
6171 }
6172 
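/*
 * Attach a namespace to this controller. Besides registering it in the
 * namespace table, track in dmrsl the smallest per-namespace Dataset
 * Management range size limit, derived from the block layer maximum
 * request size divided by the logical block size.
 */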
6173 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6174 {
6175     uint32_t nsid = ns->params.nsid;
6176     assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6177 
6178     n->namespaces[nsid] = ns;
6179     ns->attached++;
6180 
6181     n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6182                             BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6183 }
6184 
6185 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6186 {
6187     NvmeCtrl *n = NVME(pci_dev);
6188     NvmeNamespace *ns;
6189     Error *local_err = NULL;
6190 
6191     nvme_check_constraints(n, &local_err);
6192     if (local_err) {
6193         error_propagate(errp, local_err);
6194         return;
6195     }
6196 
6197     qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6198                         &pci_dev->qdev, n->parent_obj.qdev.id);
6199 
6200     nvme_init_state(n);
6201     if (nvme_init_pci(n, pci_dev, errp)) {
6202         return;
6203     }
6204 
6205     if (nvme_init_subsys(n, errp)) {
6207         return;
6208     }
6209     nvme_init_ctrl(n, pci_dev);
6210 
6211     /* setup a namespace if the controller drive property was given */
6212     if (n->namespace.blkconf.blk) {
6213         ns = &n->namespace;
6214         ns->params.nsid = 1;
6215 
6216         if (nvme_ns_setup(n, ns, errp)) {
6217             return;
6218         }
6219 
6220         nvme_attach_ns(n, ns);
6221     }
6222 }
6223 
6224 static void nvme_exit(PCIDevice *pci_dev)
6225 {
6226     NvmeCtrl *n = NVME(pci_dev);
6227     NvmeNamespace *ns;
6228     int i;
6229 
6230     nvme_ctrl_reset(n);
6231 
6232     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6233         ns = nvme_ns(n, i);
6234         if (!ns) {
6235             continue;
6236         }
6237 
6238         nvme_ns_cleanup(ns);
6239     }
6240 
6241     g_free(n->cq);
6242     g_free(n->sq);
6243     g_free(n->aer_reqs);
6244 
6245     if (n->params.cmb_size_mb) {
6246         g_free(n->cmb.buf);
6247     }
6248 
6249     if (n->pmr.dev) {
6250         host_memory_backend_set_mapped(n->pmr.dev, false);
6251     }
6252     msix_uninit(pci_dev, &n->bar0, &n->bar0);
6253     memory_region_del_subregion(&n->bar0, &n->iomem);
6254 }
6255 
6256 static Property nvme_props[] = {
6257     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6258     DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6259                      HostMemoryBackend *),
6260     DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6261                      NvmeSubsystem *),
6262     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6263     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6264     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6265     DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6266     DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6267     DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6268     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6269     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6270     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6271     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6272     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6273     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6274     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6275                      params.auto_transition_zones, true),
6276     DEFINE_PROP_END_OF_LIST(),
6277 };
6278 
6279 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6280                                    void *opaque, Error **errp)
6281 {
6282     NvmeCtrl *n = NVME(obj);
6283     uint8_t value = n->smart_critical_warning;
6284 
6285     visit_type_uint8(v, name, &value, errp);
6286 }
6287 
6288 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6289                                    void *opaque, Error **errp)
6290 {
6291     NvmeCtrl *n = NVME(obj);
6292     uint8_t value, old_value, cap = 0, index, event;
6293 
6294     if (!visit_type_uint8(v, name, &value, errp)) {
6295         return;
6296     }
6297 
6298     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6299           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6300     if (NVME_CAP_PMRS(n->bar.cap)) {
6301         cap |= NVME_SMART_PMR_UNRELIABLE;
6302     }
6303 
6304     if ((value & cap) != value) {
6305         error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6306                    value & ~cap);
6307         return;
6308     }
6309 
6310     old_value = n->smart_critical_warning;
6311     n->smart_critical_warning = value;
6312 
6313     /* only inject new bits of smart critical warning */
6314     for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6315         event = 1 << index;
6316         if (value & ~old_value & event)
6317             nvme_smart_event(n, event);
6318     }
6319 }
6320 
6321 static const VMStateDescription nvme_vmstate = {
6322     .name = "nvme",
6323     .unmigratable = 1,
6324 };
6325 
6326 static void nvme_class_init(ObjectClass *oc, void *data)
6327 {
6328     DeviceClass *dc = DEVICE_CLASS(oc);
6329     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6330 
6331     pc->realize = nvme_realize;
6332     pc->exit = nvme_exit;
6333     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6334     pc->revision = 2;
6335 
6336     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6337     dc->desc = "Non-Volatile Memory Express";
6338     device_class_set_props(dc, nvme_props);
6339     dc->vmsd = &nvme_vmstate;
6340 }
6341 
6342 static void nvme_instance_init(Object *obj)
6343 {
6344     NvmeCtrl *n = NVME(obj);
6345 
6346     device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6347                                   "bootindex", "/namespace@1,0",
6348                                   DEVICE(obj));
6349 
6350     object_property_add(obj, "smart_critical_warning", "uint8",
6351                         nvme_get_smart_warning,
6352                         nvme_set_smart_warning, NULL, NULL);
6353 }
6354 
6355 static const TypeInfo nvme_info = {
6356     .name          = TYPE_NVME,
6357     .parent        = TYPE_PCI_DEVICE,
6358     .instance_size = sizeof(NvmeCtrl),
6359     .instance_init = nvme_instance_init,
6360     .class_init    = nvme_class_init,
6361     .interfaces = (InterfaceInfo[]) {
6362         { INTERFACE_PCIE_DEVICE },
6363         { }
6364     },
6365 };
6366 
6367 static const TypeInfo nvme_bus_info = {
6368     .name = TYPE_NVME_BUS,
6369     .parent = TYPE_BUS,
6370     .instance_size = sizeof(NvmeBus),
6371 };
6372 
6373 static void nvme_register_types(void)
6374 {
6375     type_register_static(&nvme_info);
6376     type_register_static(&nvme_bus_info);
6377 }
6378 
6379 type_init(nvme_register_types)
6380