xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 2d1bf258)
1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <keith.busch@intel.com>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10 
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13  *
14  *  https://nvmexpress.org/developers/nvme-specification/
15  *
16  *
17  * Notes on coding style
18  * ---------------------
19  * While the QEMU coding style prefers lowercase hexadecimals in constants, the
20  * NVMe subsystem uses the format of the NVMe specifications in comments
21  * (i.e. an 'h' suffix instead of a '0x' prefix).
22  *
23  * Usage
24  * -----
25  * See docs/system/nvme.rst for extensive documentation.
26  *
27  * Add options:
28  *      -drive file=<file>,if=none,id=<drive_id>
29  *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30  *      -device nvme,serial=<serial>,id=<bus_name>, \
31  *              cmb_size_mb=<cmb_size_mb[optional]>, \
32  *              [pmrdev=<mem_backend_file_id>,] \
33  *              max_ioqpairs=<N[optional]>, \
34  *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35  *              mdts=<N[optional]>,vsl=<N[optional]>, \
36  *              zoned.zasl=<N[optional]>, \
37  *              zoned.auto_transition=<on|off[optional]>, \
38  *              sriov_max_vfs=<N[optional]> \
39  *              sriov_vq_flexible=<N[optional]> \
40  *              sriov_vi_flexible=<N[optional]> \
41  *              sriov_max_vi_per_vf=<N[optional]> \
42  *              sriov_max_vq_per_vf=<N[optional]> \
43  *              subsys=<subsys_id>
44  *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
45  *              zoned=<true|false[optional]>, \
46  *              subsys=<subsys_id>,shared=<true|false[optional]>, \
47  *              detached=<true|false[optional]>, \
48  *              zoned.zone_size=<N[optional]>, \
49  *              zoned.zone_capacity=<N[optional]>, \
50  *              zoned.descr_ext_size=<N[optional]>, \
51  *              zoned.max_active=<N[optional]>, \
52  *              zoned.max_open=<N[optional]>, \
53  *              zoned.cross_read=<true|false[optional]>
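 *
 *      For illustration only (the option values below are arbitrary), a
 *      minimal invocation combining the options above might look like:
 *
 *      -drive file=nvm.img,if=none,id=nvm \
 *      -device nvme,serial=deadbeef,id=nvme0,max_ioqpairs=8 \
 *      -device nvme-ns,drive=nvm,bus=nvme0,nsid=1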
54  *
55  * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
56  * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
57  * default, the device uses the "v1.4 CMB scheme" - use the `legacy-cmb`
58  * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
59  *
60  * PMR emulation can be enabled by pointing `pmrdev` to a memory-backend-file.
61  * For example:
62  * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
63  *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
64  *
65  * The PMR will use BAR 4/5 exclusively.
66  *
67  * To place controller(s) and namespace(s) in a subsystem, provide an
68  * nvme-subsys device as shown above.
69  *
70  * nvme subsystem device parameters
71  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
72  * - `nqn`
73  *   This parameter provides the `<nqn_id>` part of the string
74  *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
75  *   of subsystem controllers. Note that `<nqn_id>` should be unique per
76  *   subsystem, but this is not enforced by QEMU. If not specified, it will
77  *   default to the value of the `id` parameter (`<subsys_id>`).
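 *
 *   For example (an illustrative value), specifying
 *   `-device nvme-subsys,id=subsys0,nqn=foo` results in subsystem controllers
 *   reporting a SUBNQN of `nqn.2019-08.org.qemu:foo`.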
78  *
79  * nvme device parameters
80  * ~~~~~~~~~~~~~~~~~~~~~~
81  * - `subsys`
82  *   Specifying this parameter attaches the controller to the subsystem and
83  *   the SUBNQN field in the controller will report the NQN of the subsystem
84  *   device. This also enables multi controller capability represented in
85  *   Identify Controller data structure in CMIC (Controller Multi-path I/O and
86  *   Namespace Sharing Capabilities).
87  *
88  * - `aerl`
89  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
90  *   of concurrently outstanding Asynchronous Event Request commands supported
91  *   by the controller. This is a 0's based value.
92  *
93  * - `aer_max_queued`
94  *   This is the maximum number of events that the device will enqueue for
95  *   completion when there are no outstanding AERs. When the maximum number of
96  *   enqueued events is reached, subsequent events will be dropped.
97  *
98  * - `mdts`
99  *   Indicates the maximum data transfer size for a command that transfers data
100  *   between host-accessible memory and the controller. The value is specified
101  *   as a power of two (2^n) and is in units of the minimum memory page size
102  *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
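 *
 *   For example, with a minimum memory page size (CAP.MPSMIN) of 4 KiB, the
 *   default `mdts=7` permits transfers of up to 2^7 * 4 KiB = 512 KiB.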
103  *
104  * - `vsl`
105  *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
106  *   this value is specified as a power of two (2^n) and is in units of the
107  *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
108  *   KiB).
109  *
110  * - `zoned.zasl`
111  *   Indicates the maximum data transfer size for the Zone Append command. Like
112  *   `mdts`, the value is specified as a power of two (2^n) and is in units of
113  *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
114  *   defaulting to the value of `mdts`).
115  *
116  * - `zoned.auto_transition`
117  *   Indicates whether zones in the Implicitly Opened state may be automatically
118  *   transitioned to the Closed state for resource management purposes.
119  *   Defaults to 'on'.
120  *
121  * - `sriov_max_vfs`
122  *   Indicates the maximum number of PCIe virtual functions supported
123  *   by the controller. The default value is 0. Specifying a non-zero value
124  *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
125  *   Virtual function controllers will not report SR-IOV capability.
126  *
127  *   NOTE: Single Root I/O Virtualization support is experimental.
128  *   All the related parameters may be subject to change.
129  *
130  * - `sriov_vq_flexible`
131  *   Indicates the total number of flexible queue resources assignable to all
132  *   the secondary controllers. Implicitly sets the primary controller's number
133  *   of private resources to `(max_ioqpairs - sriov_vq_flexible)`.
134  *
135  * - `sriov_vi_flexible`
136  *   Indicates the total number of flexible interrupt resources assignable to
137  *   all the secondary controllers. Implicitly sets the primary controller's
138  *   number of private resources to `(msix_qsize - sriov_vi_flexible)`.
139  *
140  * - `sriov_max_vi_per_vf`
141  *   Indicates the maximum number of virtual interrupt resources assignable
142  *   to a secondary controller. The default 0 resolves to
143  *   `(sriov_vi_flexible / sriov_max_vfs)`.
144  *
145  * - `sriov_max_vq_per_vf`
146  *   Indicates the maximum number of virtual queue resources assignable to
147  *   a secondary controller. The default 0 resolves to
148  *   `(sriov_vq_flexible / sriov_max_vfs)`.
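 *
 *   As a worked example (the values are illustrative only): with
 *   `max_ioqpairs=26`, `msix_qsize=27`, `sriov_max_vfs=4`,
 *   `sriov_vq_flexible=8` and `sriov_vi_flexible=4`, the primary controller
 *   retains 26 - 8 = 18 private queue resources and 27 - 4 = 23 private
 *   interrupt resources, and each secondary controller may by default be
 *   assigned up to 8 / 4 = 2 flexible queue and 4 / 4 = 1 flexible interrupt
 *   resources.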
149  *
150  * nvme namespace device parameters
151  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
152  * - `shared`
153  *   When the parent nvme device (as defined explicitly by the 'bus' parameter
154  *   or implicitly by the most recently defined NvmeBus) is linked to an
155  *   nvme-subsys device, the namespace will be attached to all controllers in
156  *   the subsystem. If set to 'off' (the default), the namespace will remain a
157  *   private namespace and may only be attached to a single controller at a
158  *   time.
159  *
160  * - `detached`
161  *   This parameter is only valid together with the `subsys` parameter. If left
162  *   at the default value (`false/off`), the namespace will be attached to all
163  *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
164  *   namespace will be available in the subsystem but not attached to any
165  *   controllers.
166  *
167  * Setting `zoned` to true selects the Zoned Command Set for the namespace.
168  * In this case, the following namespace properties are available to configure
169  * zoned operation (an illustrative example follows the list):
170  *     zoned.zone_size=<zone size in bytes, default: 128MiB>
171  *         The number may be followed by K, M or G as in kilo-, mega- or giga-.
172  *
173  *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
174  *         The value 0 (default) forces zone capacity to be the same as zone
175  *         size. The value of this property may not exceed zone size.
176  *
177  *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
178  *         This value needs to be specified in 64B units. If it is zero,
179  *         namespace(s) will not support zone descriptor extensions.
180  *
181  *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
182  *         The default value means there is no limit to the number of
183  *         concurrently active zones.
184  *
185  *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
186  *         The default value means there is no limit to the number of
187  *         concurrently open zones.
188  *
189  *     zoned.cross_read=<enable RAZB, default: false>
190  *         Setting this property to true enables Read Across Zone Boundaries.
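 *
 *     For illustration only (the sizes are arbitrary), a zoned namespace
 *     could be configured as:
 *
 *     -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=1,zoned=true, \
 *             zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32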
191  */
192 
193 #include "qemu/osdep.h"
194 #include "qemu/cutils.h"
195 #include "qemu/error-report.h"
196 #include "qemu/log.h"
197 #include "qemu/units.h"
198 #include "qemu/range.h"
199 #include "qapi/error.h"
200 #include "qapi/visitor.h"
201 #include "sysemu/sysemu.h"
202 #include "sysemu/block-backend.h"
203 #include "sysemu/hostmem.h"
204 #include "hw/pci/msix.h"
205 #include "hw/pci/pcie_sriov.h"
206 #include "migration/vmstate.h"
207 
208 #include "nvme.h"
209 #include "dif.h"
210 #include "trace.h"
211 
212 #define NVME_MAX_IOQPAIRS 0xffff
213 #define NVME_DB_SIZE  4
214 #define NVME_SPEC_VER 0x00010400
215 #define NVME_CMB_BIR 2
216 #define NVME_PMR_BIR 4
217 #define NVME_TEMPERATURE 0x143
218 #define NVME_TEMPERATURE_WARNING 0x157
219 #define NVME_TEMPERATURE_CRITICAL 0x175
220 #define NVME_NUM_FW_SLOTS 1
221 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
222 #define NVME_VF_RES_GRANULARITY 1
223 #define NVME_VF_OFFSET 0x1
224 #define NVME_VF_STRIDE 1
225 
226 #define NVME_GUEST_ERR(trace, fmt, ...) \
227     do { \
228         (trace_##trace)(__VA_ARGS__); \
229         qemu_log_mask(LOG_GUEST_ERROR, #trace \
230             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
231     } while (0)
232 
233 static const bool nvme_feature_support[NVME_FID_MAX] = {
234     [NVME_ARBITRATION]              = true,
235     [NVME_POWER_MANAGEMENT]         = true,
236     [NVME_TEMPERATURE_THRESHOLD]    = true,
237     [NVME_ERROR_RECOVERY]           = true,
238     [NVME_VOLATILE_WRITE_CACHE]     = true,
239     [NVME_NUMBER_OF_QUEUES]         = true,
240     [NVME_INTERRUPT_COALESCING]     = true,
241     [NVME_INTERRUPT_VECTOR_CONF]    = true,
242     [NVME_WRITE_ATOMICITY]          = true,
243     [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
244     [NVME_TIMESTAMP]                = true,
245     [NVME_HOST_BEHAVIOR_SUPPORT]    = true,
246     [NVME_COMMAND_SET_PROFILE]      = true,
247     [NVME_FDP_MODE]                 = true,
248     [NVME_FDP_EVENTS]               = true,
249 };
250 
251 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
252     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
253     [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
254     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
255     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
256     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
257     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
258     [NVME_HOST_BEHAVIOR_SUPPORT]    = NVME_FEAT_CAP_CHANGE,
259     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
260     [NVME_FDP_MODE]                 = NVME_FEAT_CAP_CHANGE,
261     [NVME_FDP_EVENTS]               = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
262 };
263 
264 static const uint32_t nvme_cse_acs[256] = {
265     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
266     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
267     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
268     [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
269     [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
270     [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
271     [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
272     [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
273     [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
274     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
275     [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
276     [NVME_ADM_CMD_VIRT_MNGMT]       = NVME_CMD_EFF_CSUPP,
277     [NVME_ADM_CMD_DBBUF_CONFIG]     = NVME_CMD_EFF_CSUPP,
278     [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
279     [NVME_ADM_CMD_DIRECTIVE_RECV]   = NVME_CMD_EFF_CSUPP,
280     [NVME_ADM_CMD_DIRECTIVE_SEND]   = NVME_CMD_EFF_CSUPP,
281 };
282 
283 static const uint32_t nvme_cse_iocs_none[256];
284 
285 static const uint32_t nvme_cse_iocs_nvm[256] = {
286     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
287     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
288     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
289     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
290     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
291     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
292     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
294     [NVME_CMD_IO_MGMT_RECV]         = NVME_CMD_EFF_CSUPP,
295     [NVME_CMD_IO_MGMT_SEND]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
296 };
297 
298 static const uint32_t nvme_cse_iocs_zoned[256] = {
299     [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
300     [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
301     [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
302     [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
303     [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
304     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
305     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
307     [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
308     [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
309     [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
310 };
311 
312 static void nvme_process_sq(void *opaque);
313 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
314 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
315 
316 static uint16_t nvme_sqid(NvmeRequest *req)
317 {
318     return le16_to_cpu(req->sq->sqid);
319 }
320 
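/*
 * Compose an FDP placement identifier (PID) from a reclaim group (RG) index
 * and a placement handle (PH). The reclaim group occupies the most
 * significant `rgif` bits of the 16-bit PID; when no reclaim group bits are
 * in use (rgif == 0), the PID is simply the placement handle.
 */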
321 static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
322                                      uint16_t ph)
323 {
324     uint16_t rgif = ns->endgrp->fdp.rgif;
325 
326     if (!rgif) {
327         return ph;
328     }
329 
330     return (rg << (16 - rgif)) | ph;
331 }
332 
333 static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
334 {
335     return ph < ns->fdp.nphs;
336 }
337 
338 static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
339 {
340     return rg < endgrp->fdp.nrg;
341 }
342 
343 static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
344 {
345     uint16_t rgif = ns->endgrp->fdp.rgif;
346 
347     if (!rgif) {
348         return pid;
349     }
350 
351     return pid & ((1 << (15 - rgif)) - 1);
352 }
353 
354 static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
355 {
356     uint16_t rgif = ns->endgrp->fdp.rgif;
357 
358     if (!rgif) {
359         return 0;
360     }
361 
362     return pid >> (16 - rgif);
363 }
364 
365 static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
366                                   uint16_t *ph, uint16_t *rg)
367 {
368     *rg = nvme_pid2rg(ns, pid);
369     *ph = nvme_pid2ph(ns, pid);
370 
371     return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
372 }
373 
374 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
375                                    NvmeZoneState state)
376 {
377     if (QTAILQ_IN_USE(zone, entry)) {
378         switch (nvme_get_zone_state(zone)) {
379         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
380             QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
381             break;
382         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
383             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
384             break;
385         case NVME_ZONE_STATE_CLOSED:
386             QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
387             break;
388         case NVME_ZONE_STATE_FULL:
389             QTAILQ_REMOVE(&ns->full_zones, zone, entry);
390         default:
391             ;
392         }
393     }
394 
395     nvme_set_zone_state(zone, state);
396 
397     switch (state) {
398     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
399         QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
400         break;
401     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
402         QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
403         break;
404     case NVME_ZONE_STATE_CLOSED:
405         QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
406         break;
407     case NVME_ZONE_STATE_FULL:
408         QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
409     case NVME_ZONE_STATE_READ_ONLY:
410         break;
411     default:
412         zone->d.za = 0;
413     }
414 }
415 
416 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
417                                          uint32_t opn, uint32_t zrwa)
418 {
419     if (ns->params.max_active_zones != 0 &&
420         ns->nr_active_zones + act > ns->params.max_active_zones) {
421         trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
422         return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
423     }
424 
425     if (ns->params.max_open_zones != 0 &&
426         ns->nr_open_zones + opn > ns->params.max_open_zones) {
427         trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
428         return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
429     }
430 
431     if (zrwa > ns->zns.numzrwa) {
432         return NVME_NOZRWA | NVME_DNR;
433     }
434 
435     return NVME_SUCCESS;
436 }
437 
438 /*
439  * Check if we can open a zone without exceeding open/active limits.
440  * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
441  */
442 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
443 {
444     return nvme_zns_check_resources(ns, act, opn, 0);
445 }
446 
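/*
 * Reserve a slot in the circular FDP event buffer. If the buffer is full, the
 * oldest event is overwritten and the start index advances past it.
 */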
447 static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
448 {
449     NvmeFdpEvent *ret = NULL;
450     bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
451 
452     ret = &ebuf->events[ebuf->next++];
453     if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
454         ebuf->next = 0;
455     }
456     if (is_full) {
457         ebuf->start = ebuf->next;
458     } else {
459         ebuf->nelems++;
460     }
461 
462     memset(ret, 0, sizeof(NvmeFdpEvent));
463     ret->timestamp = nvme_get_timestamp(n);
464 
465     return ret;
466 }
467 
468 static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
469 {
470     return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
471 }
472 
473 static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
474 {
475     NvmeEnduranceGroup *endgrp = ns->endgrp;
476     NvmeRuHandle *ruh;
477     NvmeReclaimUnit *ru;
478     NvmeFdpEvent *e = NULL;
479     uint16_t ph, rg, ruhid;
480 
481     if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
482         return false;
483     }
484 
485     ruhid = ns->fdp.phs[ph];
486 
487     ruh = &endgrp->fdp.ruhs[ruhid];
488     ru = &ruh->rus[rg];
489 
490     if (ru->ruamw) {
491         if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
492             e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
493             e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
494             e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
495             e->pid = cpu_to_le16(pid);
496             e->nsid = cpu_to_le32(ns->params.nsid);
497             e->rgid = cpu_to_le16(rg);
498             e->ruhid = cpu_to_le16(ruhid);
499         }
500 
501         /* log (eventual) GC overhead of prematurely swapping the RU */
502         nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
503     }
504 
505     ru->ruamw = ruh->ruamw;
506 
507     return true;
508 }
509 
510 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
511 {
512     hwaddr hi, lo;
513 
514     if (!n->cmb.cmse) {
515         return false;
516     }
517 
518     lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
519     hi = lo + int128_get64(n->cmb.mem.size);
520 
521     return addr >= lo && addr < hi;
522 }
523 
524 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
525 {
526     hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
527     return &n->cmb.buf[addr - base];
528 }
529 
530 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
531 {
532     hwaddr hi;
533 
534     if (!n->pmr.cmse) {
535         return false;
536     }
537 
538     hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
539 
540     return addr >= n->pmr.cba && addr < hi;
541 }
542 
543 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
544 {
545     return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
546 }
547 
548 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
549 {
550     hwaddr hi, lo;
551 
552     /*
553      * The purpose of this check is to guard against invalid "local" access to
554      * the iomem (i.e. controller registers). Thus, we check against the range
555      * covered by the 'bar0' MemoryRegion since that is currently composed of
556      * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
557      * that if the device model is ever changed to allow the CMB to be located
558      * in BAR0 as well, then this must be changed.
559      */
560     lo = n->bar0.addr;
561     hi = lo + int128_get64(n->bar0.size);
562 
563     return addr >= lo && addr < hi;
564 }
565 
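/*
 * Read `size` bytes from a guest-supplied address. Ranges that fall entirely
 * within the CMB or the PMR are copied directly from the controller's own
 * memory; anything else goes through regular PCI DMA.
 */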
566 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
567 {
568     hwaddr hi = addr + size - 1;
569     if (hi < addr) {
570         return 1;
571     }
572 
573     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
574         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
575         return 0;
576     }
577 
578     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
579         memcpy(buf, nvme_addr_to_pmr(n, addr), size);
580         return 0;
581     }
582 
583     return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
584 }
585 
586 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
587 {
588     hwaddr hi = addr + size - 1;
589     if (hi < addr) {
590         return 1;
591     }
592 
593     if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
594         memcpy(nvme_addr_to_cmb(n, addr), buf, size);
595         return 0;
596     }
597 
598     if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
599         memcpy(nvme_addr_to_pmr(n, addr), buf, size);
600         return 0;
601     }
602 
603     return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
604 }
605 
606 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
607 {
608     return nsid &&
609         (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
610 }
611 
612 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
613 {
614     return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
615 }
616 
617 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
618 {
619     return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
620 }
621 
622 static void nvme_inc_cq_tail(NvmeCQueue *cq)
623 {
624     cq->tail++;
625     if (cq->tail >= cq->size) {
626         cq->tail = 0;
627         cq->phase = !cq->phase;
628     }
629 }
630 
631 static void nvme_inc_sq_head(NvmeSQueue *sq)
632 {
633     sq->head = (sq->head + 1) % sq->size;
634 }
635 
636 static uint8_t nvme_cq_full(NvmeCQueue *cq)
637 {
638     return (cq->tail + 1) % cq->size == cq->head;
639 }
640 
641 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
642 {
643     return sq->head == sq->tail;
644 }
645 
646 static void nvme_irq_check(NvmeCtrl *n)
647 {
648     PCIDevice *pci = PCI_DEVICE(n);
649     uint32_t intms = ldl_le_p(&n->bar.intms);
650 
651     if (msix_enabled(pci)) {
652         return;
653     }
654     if (~intms & n->irq_status) {
655         pci_irq_assert(pci);
656     } else {
657         pci_irq_deassert(pci);
658     }
659 }
660 
661 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
662 {
663     PCIDevice *pci = PCI_DEVICE(n);
664 
665     if (cq->irq_enabled) {
666         if (msix_enabled(pci)) {
667             trace_pci_nvme_irq_msix(cq->vector);
668             msix_notify(pci, cq->vector);
669         } else {
670             trace_pci_nvme_irq_pin();
671             assert(cq->vector < 32);
672             n->irq_status |= 1 << cq->vector;
673             nvme_irq_check(n);
674         }
675     } else {
676         trace_pci_nvme_irq_masked();
677     }
678 }
679 
680 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
681 {
682     if (cq->irq_enabled) {
683         if (msix_enabled(PCI_DEVICE(n))) {
684             return;
685         } else {
686             assert(cq->vector < 32);
687             if (!n->cq_pending) {
688                 n->irq_status &= ~(1 << cq->vector);
689             }
690             nvme_irq_check(n);
691         }
692     }
693 }
694 
695 static void nvme_req_clear(NvmeRequest *req)
696 {
697     req->ns = NULL;
698     req->opaque = NULL;
699     req->aiocb = NULL;
700     memset(&req->cqe, 0x0, sizeof(req->cqe));
701     req->status = NVME_SUCCESS;
702 }
703 
704 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
705 {
706     if (dma) {
707         pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
708         sg->flags = NVME_SG_DMA;
709     } else {
710         qemu_iovec_init(&sg->iov, 0);
711     }
712 
713     sg->flags |= NVME_SG_ALLOC;
714 }
715 
716 static inline void nvme_sg_unmap(NvmeSg *sg)
717 {
718     if (!(sg->flags & NVME_SG_ALLOC)) {
719         return;
720     }
721 
722     if (sg->flags & NVME_SG_DMA) {
723         qemu_sglist_destroy(&sg->qsg);
724     } else {
725         qemu_iovec_destroy(&sg->iov);
726     }
727 
728     memset(sg, 0x0, sizeof(*sg));
729 }
730 
731 /*
732  * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
733  * holds both data and metadata. This function splits the data and metadata
734  * into two separate QSG/IOVs.
735  */
736 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
737                           NvmeSg *mdata)
738 {
739     NvmeSg *dst = data;
740     uint32_t trans_len, count = ns->lbasz;
741     uint64_t offset = 0;
742     bool dma = sg->flags & NVME_SG_DMA;
743     size_t sge_len;
744     size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
745     int sg_idx = 0;
746 
747     assert(sg->flags & NVME_SG_ALLOC);
748 
749     while (sg_len) {
750         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
751 
752         trans_len = MIN(sg_len, count);
753         trans_len = MIN(trans_len, sge_len - offset);
754 
755         if (dst) {
756             if (dma) {
757                 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
758                                 trans_len);
759             } else {
760                 qemu_iovec_add(&dst->iov,
761                                sg->iov.iov[sg_idx].iov_base + offset,
762                                trans_len);
763             }
764         }
765 
766         sg_len -= trans_len;
767         count -= trans_len;
768         offset += trans_len;
769 
770         if (count == 0) {
771             dst = (dst == data) ? mdata : data;
772             count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
773         }
774 
775         if (sge_len == offset) {
776             offset = 0;
777             sg_idx++;
778         }
779     }
780 }
781 
782 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
783                                   size_t len)
784 {
785     if (!len) {
786         return NVME_SUCCESS;
787     }
788 
789     trace_pci_nvme_map_addr_cmb(addr, len);
790 
791     if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
792         return NVME_DATA_TRAS_ERROR;
793     }
794 
795     qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
796 
797     return NVME_SUCCESS;
798 }
799 
800 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
801                                   size_t len)
802 {
803     if (!len) {
804         return NVME_SUCCESS;
805     }
806 
807     if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
808         return NVME_DATA_TRAS_ERROR;
809     }
810 
811     qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
812 
813     return NVME_SUCCESS;
814 }
815 
816 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
817 {
818     bool cmb = false, pmr = false;
819 
820     if (!len) {
821         return NVME_SUCCESS;
822     }
823 
824     trace_pci_nvme_map_addr(addr, len);
825 
826     if (nvme_addr_is_iomem(n, addr)) {
827         return NVME_DATA_TRAS_ERROR;
828     }
829 
830     if (nvme_addr_is_cmb(n, addr)) {
831         cmb = true;
832     } else if (nvme_addr_is_pmr(n, addr)) {
833         pmr = true;
834     }
835 
836     if (cmb || pmr) {
837         if (sg->flags & NVME_SG_DMA) {
838             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
839         }
840 
841         if (sg->iov.niov + 1 > IOV_MAX) {
842             goto max_mappings_exceeded;
843         }
844 
845         if (cmb) {
846             return nvme_map_addr_cmb(n, &sg->iov, addr, len);
847         } else {
848             return nvme_map_addr_pmr(n, &sg->iov, addr, len);
849         }
850     }
851 
852     if (!(sg->flags & NVME_SG_DMA)) {
853         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
854     }
855 
856     if (sg->qsg.nsg + 1 > IOV_MAX) {
857         goto max_mappings_exceeded;
858     }
859 
860     qemu_sglist_add(&sg->qsg, addr, len);
861 
862     return NVME_SUCCESS;
863 
864 max_mappings_exceeded:
865     NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
866                    "number of mappings exceed 1024");
867     return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
868 }
869 
870 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
871 {
872     return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
873 }
874 
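/*
 * Map a PRP-described buffer. PRP1 covers the first (possibly unaligned)
 * page. Any remainder is covered either by PRP2 directly (when it fits within
 * one more page) or by a PRP list pointed to by PRP2, whose last entry may
 * chain to a further list for large transfers.
 */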
875 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
876                              uint64_t prp2, uint32_t len)
877 {
878     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
879     trans_len = MIN(len, trans_len);
880     int num_prps = (len >> n->page_bits) + 1;
881     uint16_t status;
882     int ret;
883 
884     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
885 
886     nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
887 
888     status = nvme_map_addr(n, sg, prp1, trans_len);
889     if (status) {
890         goto unmap;
891     }
892 
893     len -= trans_len;
894     if (len) {
895         if (len > n->page_size) {
896             g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
897             uint32_t nents, prp_trans;
898             int i = 0;
899 
900             /*
901              * The first PRP list entry, pointed to by PRP2, may contain an
902              * offset. Hence, we need to calculate the number of entries based
903              * on that offset.
904              */
905             nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
906             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
907             ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
908             if (ret) {
909                 trace_pci_nvme_err_addr_read(prp2);
910                 status = NVME_DATA_TRAS_ERROR;
911                 goto unmap;
912             }
913             while (len != 0) {
914                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
915 
916                 if (i == nents - 1 && len > n->page_size) {
917                     if (unlikely(prp_ent & (n->page_size - 1))) {
918                         trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
919                         status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
920                         goto unmap;
921                     }
922 
923                     i = 0;
924                     nents = (len + n->page_size - 1) >> n->page_bits;
925                     nents = MIN(nents, n->max_prp_ents);
926                     prp_trans = nents * sizeof(uint64_t);
927                     ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
928                                          prp_trans);
929                     if (ret) {
930                         trace_pci_nvme_err_addr_read(prp_ent);
931                         status = NVME_DATA_TRAS_ERROR;
932                         goto unmap;
933                     }
934                     prp_ent = le64_to_cpu(prp_list[i]);
935                 }
936 
937                 if (unlikely(prp_ent & (n->page_size - 1))) {
938                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
939                     status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
940                     goto unmap;
941                 }
942 
943                 trans_len = MIN(len, n->page_size);
944                 status = nvme_map_addr(n, sg, prp_ent, trans_len);
945                 if (status) {
946                     goto unmap;
947                 }
948 
949                 len -= trans_len;
950                 i++;
951             }
952         } else {
953             if (unlikely(prp2 & (n->page_size - 1))) {
954                 trace_pci_nvme_err_invalid_prp2_align(prp2);
955                 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
956                 goto unmap;
957             }
958             status = nvme_map_addr(n, sg, prp2, len);
959             if (status) {
960                 goto unmap;
961             }
962         }
963     }
964 
965     return NVME_SUCCESS;
966 
967 unmap:
968     nvme_sg_unmap(sg);
969     return status;
970 }
971 
972 /*
973  * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
974  * number of bytes mapped from *len.
975  */
976 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
977                                   NvmeSglDescriptor *segment, uint64_t nsgld,
978                                   size_t *len, NvmeCmd *cmd)
979 {
980     dma_addr_t addr, trans_len;
981     uint32_t dlen;
982     uint16_t status;
983 
984     for (int i = 0; i < nsgld; i++) {
985         uint8_t type = NVME_SGL_TYPE(segment[i].type);
986 
987         switch (type) {
988         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
989             break;
990         case NVME_SGL_DESCR_TYPE_SEGMENT:
991         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
992             return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
993         default:
994             return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
995         }
996 
997         dlen = le32_to_cpu(segment[i].len);
998 
999         if (!dlen) {
1000             continue;
1001         }
1002 
1003         if (*len == 0) {
1004             /*
1005              * All data has been mapped, but the SGL contains additional
1006              * segments and/or descriptors. The controller might accept
1007              * ignoring the rest of the SGL.
1008              */
1009             uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1010             if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1011                 break;
1012             }
1013 
1014             trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1015             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1016         }
1017 
1018         trans_len = MIN(*len, dlen);
1019 
1020         addr = le64_to_cpu(segment[i].addr);
1021 
1022         if (UINT64_MAX - addr < dlen) {
1023             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1024         }
1025 
1026         status = nvme_map_addr(n, sg, addr, trans_len);
1027         if (status) {
1028             return status;
1029         }
1030 
1031         *len -= trans_len;
1032     }
1033 
1034     return NVME_SUCCESS;
1035 }
1036 
1037 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1038                              size_t len, NvmeCmd *cmd)
1039 {
1040     /*
1041      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1042      * dynamically allocating a potentially huge SGL. The spec allows the SGL
1043      * to be larger (as in number of bytes required to describe the SGL
1044      * descriptors and segment chain) than the command transfer size, so it is
1045      * not bounded by MDTS.
1046      */
1047 #define SEG_CHUNK_SIZE 256
1048 
1049     NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1050     uint64_t nsgld;
1051     uint32_t seg_len;
1052     uint16_t status;
1053     hwaddr addr;
1054     int ret;
1055 
1056     sgld = &sgl;
1057     addr = le64_to_cpu(sgl.addr);
1058 
1059     trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1060 
1061     nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1062 
1063     /*
1064      * If the entire transfer can be described with a single data block it can
1065      * be mapped directly.
1066      */
1067     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1068         status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1069         if (status) {
1070             goto unmap;
1071         }
1072 
1073         goto out;
1074     }
1075 
1076     for (;;) {
1077         switch (NVME_SGL_TYPE(sgld->type)) {
1078         case NVME_SGL_DESCR_TYPE_SEGMENT:
1079         case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1080             break;
1081         default:
1082             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1083         }
1084 
1085         seg_len = le32_to_cpu(sgld->len);
1086 
1087         /* check the length of the (Last) Segment descriptor */
1088         if (!seg_len || seg_len & 0xf) {
1089             return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1090         }
1091 
1092         if (UINT64_MAX - addr < seg_len) {
1093             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1094         }
1095 
1096         nsgld = seg_len / sizeof(NvmeSglDescriptor);
1097 
1098         while (nsgld > SEG_CHUNK_SIZE) {
1099             if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1100                 trace_pci_nvme_err_addr_read(addr);
1101                 status = NVME_DATA_TRAS_ERROR;
1102                 goto unmap;
1103             }
1104 
1105             status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1106                                        &len, cmd);
1107             if (status) {
1108                 goto unmap;
1109             }
1110 
1111             nsgld -= SEG_CHUNK_SIZE;
1112             addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1113         }
1114 
1115         ret = nvme_addr_read(n, addr, segment, nsgld *
1116                              sizeof(NvmeSglDescriptor));
1117         if (ret) {
1118             trace_pci_nvme_err_addr_read(addr);
1119             status = NVME_DATA_TRAS_ERROR;
1120             goto unmap;
1121         }
1122 
1123         last_sgld = &segment[nsgld - 1];
1124 
1125         /*
1126          * If the segment ends with a Data Block, then we are done.
1127          */
1128         if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1129             status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1130             if (status) {
1131                 goto unmap;
1132             }
1133 
1134             goto out;
1135         }
1136 
1137         /*
1138          * If the last descriptor was not a Data Block, then the current
1139          * segment must not be a Last Segment.
1140          */
1141         if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1142             status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1143             goto unmap;
1144         }
1145 
1146         sgld = last_sgld;
1147         addr = le64_to_cpu(sgld->addr);
1148 
1149         /*
1150          * Do not map the last descriptor; it will be a Segment or Last Segment
1151          * descriptor and is handled by the next iteration.
1152          */
1153         status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1154         if (status) {
1155             goto unmap;
1156         }
1157     }
1158 
1159 out:
1160     /* if there is any residual left in len, the SGL was too short */
1161     if (len) {
1162         status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1163         goto unmap;
1164     }
1165 
1166     return NVME_SUCCESS;
1167 
1168 unmap:
1169     nvme_sg_unmap(sg);
1170     return status;
1171 }
1172 
1173 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1174                        NvmeCmd *cmd)
1175 {
1176     uint64_t prp1, prp2;
1177 
1178     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1179     case NVME_PSDT_PRP:
1180         prp1 = le64_to_cpu(cmd->dptr.prp1);
1181         prp2 = le64_to_cpu(cmd->dptr.prp2);
1182 
1183         return nvme_map_prp(n, sg, prp1, prp2, len);
1184     case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1185     case NVME_PSDT_SGL_MPTR_SGL:
1186         return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1187     default:
1188         return NVME_INVALID_FIELD;
1189     }
1190 }
1191 
1192 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1193                               NvmeCmd *cmd)
1194 {
1195     int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1196     hwaddr mptr = le64_to_cpu(cmd->mptr);
1197     uint16_t status;
1198 
1199     if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1200         NvmeSglDescriptor sgl;
1201 
1202         if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1203             return NVME_DATA_TRAS_ERROR;
1204         }
1205 
1206         status = nvme_map_sgl(n, sg, sgl, len, cmd);
1207         if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1208             status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1209         }
1210 
1211         return status;
1212     }
1213 
1214     nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1215     status = nvme_map_addr(n, sg, mptr, len);
1216     if (status) {
1217         nvme_sg_unmap(sg);
1218     }
1219 
1220     return status;
1221 }
1222 
1223 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1224 {
1225     NvmeNamespace *ns = req->ns;
1226     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1227     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1228     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1229     size_t len = nvme_l2b(ns, nlb);
1230     uint16_t status;
1231 
1232     if (nvme_ns_ext(ns) &&
1233         !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1234         NvmeSg sg;
1235 
1236         len += nvme_m2b(ns, nlb);
1237 
1238         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1239         if (status) {
1240             return status;
1241         }
1242 
1243         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1244         nvme_sg_split(&sg, ns, &req->sg, NULL);
1245         nvme_sg_unmap(&sg);
1246 
1247         return NVME_SUCCESS;
1248     }
1249 
1250     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1251 }
1252 
1253 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1254 {
1255     NvmeNamespace *ns = req->ns;
1256     size_t len = nvme_m2b(ns, nlb);
1257     uint16_t status;
1258 
1259     if (nvme_ns_ext(ns)) {
1260         NvmeSg sg;
1261 
1262         len += nvme_l2b(ns, nlb);
1263 
1264         status = nvme_map_dptr(n, &sg, len, &req->cmd);
1265         if (status) {
1266             return status;
1267         }
1268 
1269         nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1270         nvme_sg_split(&sg, ns, NULL, &req->sg);
1271         nvme_sg_unmap(&sg);
1272 
1273         return NVME_SUCCESS;
1274     }
1275 
1276     return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1277 }
1278 
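/*
 * Transfer `len` bytes between `ptr` and the scatter/gather list in chunks of
 * `bytes`, skipping `skip_bytes` in the SGL after each chunk and starting
 * `offset` bytes into the SGL. This is used to pick out either the data or
 * the metadata portion of an extended-LBA (interleaved) buffer.
 */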
1279 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1280                                     uint32_t len, uint32_t bytes,
1281                                     int32_t skip_bytes, int64_t offset,
1282                                     NvmeTxDirection dir)
1283 {
1284     hwaddr addr;
1285     uint32_t trans_len, count = bytes;
1286     bool dma = sg->flags & NVME_SG_DMA;
1287     int64_t sge_len;
1288     int sg_idx = 0;
1289     int ret;
1290 
1291     assert(sg->flags & NVME_SG_ALLOC);
1292 
1293     while (len) {
1294         sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1295 
1296         if (sge_len - offset < 0) {
1297             offset -= sge_len;
1298             sg_idx++;
1299             continue;
1300         }
1301 
1302         if (sge_len == offset) {
1303             offset = 0;
1304             sg_idx++;
1305             continue;
1306         }
1307 
1308         trans_len = MIN(len, count);
1309         trans_len = MIN(trans_len, sge_len - offset);
1310 
1311         if (dma) {
1312             addr = sg->qsg.sg[sg_idx].base + offset;
1313         } else {
1314             addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1315         }
1316 
1317         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1318             ret = nvme_addr_read(n, addr, ptr, trans_len);
1319         } else {
1320             ret = nvme_addr_write(n, addr, ptr, trans_len);
1321         }
1322 
1323         if (ret) {
1324             return NVME_DATA_TRAS_ERROR;
1325         }
1326 
1327         ptr += trans_len;
1328         len -= trans_len;
1329         count -= trans_len;
1330         offset += trans_len;
1331 
1332         if (count == 0) {
1333             count = bytes;
1334             offset += skip_bytes;
1335         }
1336     }
1337 
1338     return NVME_SUCCESS;
1339 }
1340 
1341 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1342                         NvmeTxDirection dir)
1343 {
1344     assert(sg->flags & NVME_SG_ALLOC);
1345 
1346     if (sg->flags & NVME_SG_DMA) {
1347         const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1348         dma_addr_t residual;
1349 
1350         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1351             dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1352         } else {
1353             dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1354         }
1355 
1356         if (unlikely(residual)) {
1357             trace_pci_nvme_err_invalid_dma();
1358             return NVME_INVALID_FIELD | NVME_DNR;
1359         }
1360     } else {
1361         size_t bytes;
1362 
1363         if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1364             bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1365         } else {
1366             bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1367         }
1368 
1369         if (unlikely(bytes != len)) {
1370             trace_pci_nvme_err_invalid_dma();
1371             return NVME_INVALID_FIELD | NVME_DNR;
1372         }
1373     }
1374 
1375     return NVME_SUCCESS;
1376 }
1377 
1378 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1379                                 NvmeRequest *req)
1380 {
1381     uint16_t status;
1382 
1383     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1384     if (status) {
1385         return status;
1386     }
1387 
1388     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1389 }
1390 
1391 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1392                                 NvmeRequest *req)
1393 {
1394     uint16_t status;
1395 
1396     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1397     if (status) {
1398         return status;
1399     }
1400 
1401     return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1402 }
1403 
1404 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1405                           NvmeTxDirection dir, NvmeRequest *req)
1406 {
1407     NvmeNamespace *ns = req->ns;
1408     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1409     bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1410     bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1411 
1412     if (nvme_ns_ext(ns) &&
1413         !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1414         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1415                                    ns->lbaf.ms, 0, dir);
1416     }
1417 
1418     return nvme_tx(n, &req->sg, ptr, len, dir);
1419 }
1420 
1421 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1422                            NvmeTxDirection dir, NvmeRequest *req)
1423 {
1424     NvmeNamespace *ns = req->ns;
1425     uint16_t status;
1426 
1427     if (nvme_ns_ext(ns)) {
1428         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1429                                    ns->lbasz, ns->lbasz, dir);
1430     }
1431 
1432     nvme_sg_unmap(&req->sg);
1433 
1434     status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1435     if (status) {
1436         return status;
1437     }
1438 
1439     return nvme_tx(n, &req->sg, ptr, len, dir);
1440 }
1441 
1442 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1443                                  uint32_t align, BlockCompletionFunc *cb,
1444                                  NvmeRequest *req)
1445 {
1446     assert(req->sg.flags & NVME_SG_ALLOC);
1447 
1448     if (req->sg.flags & NVME_SG_DMA) {
1449         req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1450     } else {
1451         req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1452     }
1453 }
1454 
1455 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1456                                   uint32_t align, BlockCompletionFunc *cb,
1457                                   NvmeRequest *req)
1458 {
1459     assert(req->sg.flags & NVME_SG_ALLOC);
1460 
1461     if (req->sg.flags & NVME_SG_DMA) {
1462         req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1463     } else {
1464         req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1465     }
1466 }
1467 
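/*
 * With Doorbell Buffer Config (shadow doorbells) enabled, the controller
 * publishes an event index; the host only needs to write the MMIO doorbell
 * register when its new doorbell value passes this index.
 */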
1468 static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1469 {
1470     trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1471 
1472     stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1473                    MEMTXATTRS_UNSPECIFIED);
1474 }
1475 
1476 static void nvme_update_cq_head(NvmeCQueue *cq)
1477 {
1478     ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1479                    MEMTXATTRS_UNSPECIFIED);
1480 
1481     trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1482 }
1483 
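/*
 * Post completion queue entries for finished requests: write a CQE for each
 * request on the CQ's request list (stopping early if the CQ is full), return
 * the request to its submission queue's free list and raise the interrupt if
 * any new entries were posted.
 */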
1484 static void nvme_post_cqes(void *opaque)
1485 {
1486     NvmeCQueue *cq = opaque;
1487     NvmeCtrl *n = cq->ctrl;
1488     NvmeRequest *req, *next;
1489     bool pending = cq->head != cq->tail;
1490     int ret;
1491 
1492     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1493         NvmeSQueue *sq;
1494         hwaddr addr;
1495 
1496         if (n->dbbuf_enabled) {
1497             nvme_update_cq_eventidx(cq);
1498             nvme_update_cq_head(cq);
1499         }
1500 
1501         if (nvme_cq_full(cq)) {
1502             break;
1503         }
1504 
1505         sq = req->sq;
1506         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1507         req->cqe.sq_id = cpu_to_le16(sq->sqid);
1508         req->cqe.sq_head = cpu_to_le16(sq->head);
1509         addr = cq->dma_addr + (cq->tail << NVME_CQES);
1510         ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1511                             sizeof(req->cqe));
1512         if (ret) {
1513             trace_pci_nvme_err_addr_write(addr);
1514             trace_pci_nvme_err_cfs();
1515             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1516             break;
1517         }
1518         QTAILQ_REMOVE(&cq->req_list, req, entry);
1519         nvme_inc_cq_tail(cq);
1520         nvme_sg_unmap(&req->sg);
1521         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1522     }
1523     if (cq->tail != cq->head) {
1524         if (cq->irq_enabled && !pending) {
1525             n->cq_pending++;
1526         }
1527 
1528         nvme_irq_assert(n, cq);
1529     }
1530 }
1531 
1532 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1533 {
1534     assert(cq->cqid == req->sq->cqid);
1535     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1536                                           le32_to_cpu(req->cqe.result),
1537                                           le32_to_cpu(req->cqe.dw1),
1538                                           req->status);
1539 
1540     if (req->status) {
1541         trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1542                                       req->status, req->cmd.opcode);
1543     }
1544 
1545     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1546     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1547 
1548     qemu_bh_schedule(cq->bh);
1549 }
1550 
1551 static void nvme_process_aers(void *opaque)
1552 {
1553     NvmeCtrl *n = opaque;
1554     NvmeAsyncEvent *event, *next;
1555 
1556     trace_pci_nvme_process_aers(n->aer_queued);
1557 
1558     QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1559         NvmeRequest *req;
1560         NvmeAerResult *result;
1561 
1562         /* can't post cqe if there is nothing to complete */
1563         if (!n->outstanding_aers) {
1564             trace_pci_nvme_no_outstanding_aers();
1565             break;
1566         }
1567 
1568         /* ignore if masked (cqe posted, but event not cleared) */
1569         if (n->aer_mask & (1 << event->result.event_type)) {
1570             trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1571             continue;
1572         }
1573 
1574         QTAILQ_REMOVE(&n->aer_queue, event, entry);
1575         n->aer_queued--;
1576 
1577         n->aer_mask |= 1 << event->result.event_type;
1578         n->outstanding_aers--;
1579 
1580         req = n->aer_reqs[n->outstanding_aers];
1581 
1582         result = (NvmeAerResult *) &req->cqe.result;
1583         result->event_type = event->result.event_type;
1584         result->event_info = event->result.event_info;
1585         result->log_page = event->result.log_page;
1586         g_free(event);
1587 
1588         trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1589                                     result->log_page);
1590 
1591         nvme_enqueue_req_completion(&n->admin_cq, req);
1592     }
1593 }
1594 
1595 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1596                                uint8_t event_info, uint8_t log_page)
1597 {
1598     NvmeAsyncEvent *event;
1599 
1600     trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1601 
1602     if (n->aer_queued == n->params.aer_max_queued) {
1603         trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1604         return;
1605     }
1606 
1607     event = g_new(NvmeAsyncEvent, 1);
1608     event->result = (NvmeAerResult) {
1609         .event_type = event_type,
1610         .event_info = event_info,
1611         .log_page   = log_page,
1612     };
1613 
1614     QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1615     n->aer_queued++;
1616 
1617     nvme_process_aers(n);
1618 }
1619 
1620 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1621 {
1622     uint8_t aer_info;
1623 
1624     /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1625     if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1626         return;
1627     }
1628 
1629     switch (event) {
1630     case NVME_SMART_SPARE:
1631         aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1632         break;
1633     case NVME_SMART_TEMPERATURE:
1634         aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1635         break;
1636     case NVME_SMART_RELIABILITY:
1637     case NVME_SMART_MEDIA_READ_ONLY:
1638     case NVME_SMART_FAILED_VOLATILE_MEDIA:
1639     case NVME_SMART_PMR_UNRELIABLE:
1640         aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1641         break;
1642     default:
1643         return;
1644     }
1645 
1646     nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1647 }
1648 
1649 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1650 {
1651     n->aer_mask &= ~(1 << event_type);
1652     if (!QTAILQ_EMPTY(&n->aer_queue)) {
1653         nvme_process_aers(n);
1654     }
1655 }
1656 
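     /* Check a transfer length against MDTS; an MDTS of zero means no limit. */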
1657 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1658 {
1659     uint8_t mdts = n->params.mdts;
1660 
1661     if (mdts && len > n->page_size << mdts) {
1662         trace_pci_nvme_err_mdts(len);
1663         return NVME_INVALID_FIELD | NVME_DNR;
1664     }
1665 
1666     return NVME_SUCCESS;
1667 }
1668 
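     /* Verify that the LBA range [slba, slba + nlb) lies within the namespace. */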
1669 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1670                                          uint32_t nlb)
1671 {
1672     uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1673 
1674     if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1675         trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1676         return NVME_LBA_RANGE | NVME_DNR;
1677     }
1678 
1679     return NVME_SUCCESS;
1680 }
1681 
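     /*
      * Walk the allocation status of an LBA range. Returns 0 if all blocks
      * match the given BDRV_BLOCK_* flags, 1 if any block does not, or a
      * negative errno on error.
      */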
1682 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1683                                  uint32_t nlb, int flags)
1684 {
1685     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1686 
1687     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1688     int64_t offset = nvme_l2b(ns, slba);
1689     int ret;
1690 
1691     /*
1692      * `pnum` holds the number of bytes after offset that share the same
1693      * allocation status as the byte at offset. If `pnum` is different from
1694      * `bytes`, we should check the allocation status of the next range and
1695      * continue this until all bytes have been checked.
1696      */
1697     do {
1698         bytes -= pnum;
1699 
1700         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1701         if (ret < 0) {
1702             return ret;
1703         }
1704 
1705 
1706         trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1707                                     !!(ret & BDRV_BLOCK_ZERO));
1708 
1709         if (!(ret & flags)) {
1710             return 1;
1711         }
1712 
1713         offset += pnum;
1714     } while (pnum != bytes);
1715 
1716     return 0;
1717 }
1718 
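     /*
      * Deallocated or Unwritten Logical Block Error check: return NVME_DULB
      * if any block in the range is not backed by data in the image.
      */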
1719 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1720                                  uint32_t nlb)
1721 {
1722     int ret;
1723     Error *err = NULL;
1724 
1725     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1726     if (ret) {
1727         if (ret < 0) {
1728             error_setg_errno(&err, -ret, "unable to get block status");
1729             error_report_err(err);
1730 
1731             return NVME_INTERNAL_DEV_ERROR;
1732         }
1733 
1734         return NVME_DULB;
1735     }
1736 
1737     return NVME_SUCCESS;
1738 }
1739 
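     /*
      * Map a failed AIO to an NVMe status code based on the command opcode
      * and record it on the request.
      */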
1740 static void nvme_aio_err(NvmeRequest *req, int ret)
1741 {
1742     uint16_t status = NVME_SUCCESS;
1743     Error *local_err = NULL;
1744 
1745     switch (req->cmd.opcode) {
1746     case NVME_CMD_READ:
1747         status = NVME_UNRECOVERED_READ;
1748         break;
1749     case NVME_CMD_FLUSH:
1750     case NVME_CMD_WRITE:
1751     case NVME_CMD_WRITE_ZEROES:
1752     case NVME_CMD_ZONE_APPEND:
1753     case NVME_CMD_COPY:
1754         status = NVME_WRITE_FAULT;
1755         break;
1756     default:
1757         status = NVME_INTERNAL_DEV_ERROR;
1758         break;
1759     }
1760 
1761     if (ret == -ECANCELED) {
1762         status = NVME_CMD_ABORT_REQ;
1763     }
1764 
1765     trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1766 
1767     error_setg_errno(&local_err, -ret, "aio failed");
1768     error_report_err(local_err);
1769 
1770     /*
1771      * Set the command status code to the first encountered error but allow a
1772      * subsequent Internal Device Error to trump it.
1773      */
1774     if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1775         return;
1776     }
1777 
1778     req->status = status;
1779 }
1780 
1781 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1782 {
1783     return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1784                                     slba / ns->zone_size;
1785 }
1786 
1787 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1788 {
1789     uint32_t zone_idx = nvme_zone_idx(ns, slba);
1790 
1791     if (zone_idx >= ns->num_zones) {
1792         return NULL;
1793     }
1794 
1795     return &ns->zone_array[zone_idx];
1796 }
1797 
1798 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1799 {
1800     uint64_t zslba = zone->d.zslba;
1801 
1802     switch (nvme_get_zone_state(zone)) {
1803     case NVME_ZONE_STATE_EMPTY:
1804     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1805     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1806     case NVME_ZONE_STATE_CLOSED:
1807         return NVME_SUCCESS;
1808     case NVME_ZONE_STATE_FULL:
1809         trace_pci_nvme_err_zone_is_full(zslba);
1810         return NVME_ZONE_FULL;
1811     case NVME_ZONE_STATE_OFFLINE:
1812         trace_pci_nvme_err_zone_is_offline(zslba);
1813         return NVME_ZONE_OFFLINE;
1814     case NVME_ZONE_STATE_READ_ONLY:
1815         trace_pci_nvme_err_zone_is_read_only(zslba);
1816         return NVME_ZONE_READ_ONLY;
1817     default:
1818         assert(false);
1819     }
1820 
1821     return NVME_INTERNAL_DEV_ERROR;
1822 }
1823 
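     /*
      * Validate a write against its zone: the zone must be writable, the
      * write must start at the write pointer (or land within the ZRWA window
      * when one is active), and it must not cross the zone's write boundary.
      */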
1824 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1825                                       uint64_t slba, uint32_t nlb)
1826 {
1827     uint64_t zcap = nvme_zone_wr_boundary(zone);
1828     uint16_t status;
1829 
1830     status = nvme_check_zone_state_for_write(zone);
1831     if (status) {
1832         return status;
1833     }
1834 
1835     if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1836         uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1837 
1838         if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1839             trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1840             return NVME_ZONE_INVALID_WRITE;
1841         }
1842     } else {
1843         if (unlikely(slba != zone->w_ptr)) {
1844             trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1845                                                zone->w_ptr);
1846             return NVME_ZONE_INVALID_WRITE;
1847         }
1848     }
1849 
1850     if (unlikely((slba + nlb) > zcap)) {
1851         trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1852         return NVME_ZONE_BOUNDARY_ERROR;
1853     }
1854 
1855     return NVME_SUCCESS;
1856 }
1857 
1858 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1859 {
1860     switch (nvme_get_zone_state(zone)) {
1861     case NVME_ZONE_STATE_EMPTY:
1862     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1863     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1864     case NVME_ZONE_STATE_FULL:
1865     case NVME_ZONE_STATE_CLOSED:
1866     case NVME_ZONE_STATE_READ_ONLY:
1867         return NVME_SUCCESS;
1868     case NVME_ZONE_STATE_OFFLINE:
1869         trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1870         return NVME_ZONE_OFFLINE;
1871     default:
1872         assert(false);
1873     }
1874 
1875     return NVME_INTERNAL_DEV_ERROR;
1876 }
1877 
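     /*
      * Validate a read against the zone(s) it touches. Reads that cross a
      * zone boundary are only allowed when cross_zone_read is enabled and
      * every zone covered is in a readable state.
      */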
1878 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1879                                      uint32_t nlb)
1880 {
1881     NvmeZone *zone;
1882     uint64_t bndry, end;
1883     uint16_t status;
1884 
1885     zone = nvme_get_zone_by_slba(ns, slba);
1886     assert(zone);
1887 
1888     bndry = nvme_zone_rd_boundary(ns, zone);
1889     end = slba + nlb;
1890 
1891     status = nvme_check_zone_state_for_read(zone);
1892     if (status) {
1893         ;
1894     } else if (unlikely(end > bndry)) {
1895         if (!ns->params.cross_zone_read) {
1896             status = NVME_ZONE_BOUNDARY_ERROR;
1897         } else {
1898             /*
1899              * Read across zone boundary - check that all subsequent
1900              * zones that are being read have an appropriate state.
1901              */
1902             do {
1903                 zone++;
1904                 status = nvme_check_zone_state_for_read(zone);
1905                 if (status) {
1906                     break;
1907                 }
1908             } while (end > nvme_zone_rd_boundary(ns, zone));
1909         }
1910     }
1911 
1912     return status;
1913 }
1914 
1915 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1916 {
1917     switch (nvme_get_zone_state(zone)) {
1918     case NVME_ZONE_STATE_FULL:
1919         return NVME_SUCCESS;
1920 
1921     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1922     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1923         nvme_aor_dec_open(ns);
1924         /* fallthrough */
1925     case NVME_ZONE_STATE_CLOSED:
1926         nvme_aor_dec_active(ns);
1927 
1928         if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1929             zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1930             if (ns->params.numzrwa) {
1931                 ns->zns.numzrwa++;
1932             }
1933         }
1934 
1935         /* fallthrough */
1936     case NVME_ZONE_STATE_EMPTY:
1937         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1938         return NVME_SUCCESS;
1939 
1940     default:
1941         return NVME_ZONE_INVAL_TRANSITION;
1942     }
1943 }
1944 
1945 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1946 {
1947     switch (nvme_get_zone_state(zone)) {
1948     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1949     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1950         nvme_aor_dec_open(ns);
1951         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1952         /* fall through */
1953     case NVME_ZONE_STATE_CLOSED:
1954         return NVME_SUCCESS;
1955 
1956     default:
1957         return NVME_ZONE_INVAL_TRANSITION;
1958     }
1959 }
1960 
1961 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1962 {
1963     switch (nvme_get_zone_state(zone)) {
1964     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1965     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1966         nvme_aor_dec_open(ns);
1967         /* fallthrough */
1968     case NVME_ZONE_STATE_CLOSED:
1969         nvme_aor_dec_active(ns);
1970 
1971         if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1972             if (ns->params.numzrwa) {
1973                 ns->zns.numzrwa++;
1974             }
1975         }
1976 
1977         /* fallthrough */
1978     case NVME_ZONE_STATE_FULL:
1979         zone->w_ptr = zone->d.zslba;
1980         zone->d.wp = zone->w_ptr;
1981         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1982         /* fallthrough */
1983     case NVME_ZONE_STATE_EMPTY:
1984         return NVME_SUCCESS;
1985 
1986     default:
1987         return NVME_ZONE_INVAL_TRANSITION;
1988     }
1989 }
1990 
1991 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1992 {
1993     NvmeZone *zone;
1994 
1995     if (ns->params.max_open_zones &&
1996         ns->nr_open_zones == ns->params.max_open_zones) {
1997         zone = QTAILQ_FIRST(&ns->imp_open_zones);
1998         if (zone) {
1999             /*
2000              * Automatically close this implicitly open zone.
2001              */
2002             QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
2003             nvme_zrm_close(ns, zone);
2004         }
2005     }
2006 }
2007 
2008 enum {
2009     NVME_ZRM_AUTO = 1 << 0,
2010     NVME_ZRM_ZRWA = 1 << 1,
2011 };
2012 
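     /*
      * Zone Resource Management open: transition a zone to the implicitly
      * (NVME_ZRM_AUTO) or explicitly open state, accounting for active/open
      * resources and optionally allocating a ZRWA (NVME_ZRM_ZRWA).
      */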
2013 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
2014                                     NvmeZone *zone, int flags)
2015 {
2016     int act = 0;
2017     uint16_t status;
2018 
2019     switch (nvme_get_zone_state(zone)) {
2020     case NVME_ZONE_STATE_EMPTY:
2021         act = 1;
2022 
2023         /* fallthrough */
2024 
2025     case NVME_ZONE_STATE_CLOSED:
2026         if (n->params.auto_transition_zones) {
2027             nvme_zrm_auto_transition_zone(ns);
2028         }
2029         status = nvme_zns_check_resources(ns, act, 1,
2030                                           (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2031         if (status) {
2032             return status;
2033         }
2034 
2035         if (act) {
2036             nvme_aor_inc_active(ns);
2037         }
2038 
2039         nvme_aor_inc_open(ns);
2040 
2041         if (flags & NVME_ZRM_AUTO) {
2042             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2043             return NVME_SUCCESS;
2044         }
2045 
2046         /* fallthrough */
2047 
2048     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2049         if (flags & NVME_ZRM_AUTO) {
2050             return NVME_SUCCESS;
2051         }
2052 
2053         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2054 
2055         /* fallthrough */
2056 
2057     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2058         if (flags & NVME_ZRM_ZRWA) {
2059             ns->zns.numzrwa--;
2060 
2061             zone->d.za |= NVME_ZA_ZRWA_VALID;
2062         }
2063 
2064         return NVME_SUCCESS;
2065 
2066     default:
2067         return NVME_ZONE_INVAL_TRANSITION;
2068     }
2069 }
2070 
2071 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2072                                      NvmeZone *zone)
2073 {
2074     return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2075 }
2076 
2077 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2078                                  uint32_t nlb)
2079 {
2080     zone->d.wp += nlb;
2081 
2082     if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2083         nvme_zrm_finish(ns, zone);
2084     }
2085 }
2086 
2087 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2088                                            uint32_t nlbc)
2089 {
2090     uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2091 
2092     nlbc = nzrwafgs * ns->zns.zrwafg;
2093 
2094     trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2095 
2096     zone->w_ptr += nlbc;
2097 
2098     nvme_advance_zone_wp(ns, zone, nlbc);
2099 }
2100 
2101 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2102 {
2103     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2104     NvmeZone *zone;
2105     uint64_t slba;
2106     uint32_t nlb;
2107 
2108     slba = le64_to_cpu(rw->slba);
2109     nlb = le16_to_cpu(rw->nlb) + 1;
2110     zone = nvme_get_zone_by_slba(ns, slba);
2111     assert(zone);
2112 
2113     if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2114         uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2115         uint64_t elba = slba + nlb - 1;
2116 
2117         if (elba > ezrwa) {
2118             nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2119         }
2120 
2121         return;
2122     }
2123 
2124     nvme_advance_zone_wp(ns, zone, nlb);
2125 }
2126 
2127 static inline bool nvme_is_write(NvmeRequest *req)
2128 {
2129     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2130 
2131     return rw->opcode == NVME_CMD_WRITE ||
2132            rw->opcode == NVME_CMD_ZONE_APPEND ||
2133            rw->opcode == NVME_CMD_WRITE_ZEROES;
2134 }
2135 
2136 static void nvme_misc_cb(void *opaque, int ret)
2137 {
2138     NvmeRequest *req = opaque;
2139 
2140     trace_pci_nvme_misc_cb(nvme_cid(req));
2141 
2142     if (ret) {
2143         nvme_aio_err(req, ret);
2144     }
2145 
2146     nvme_enqueue_req_completion(nvme_cq(req), req);
2147 }
2148 
2149 void nvme_rw_complete_cb(void *opaque, int ret)
2150 {
2151     NvmeRequest *req = opaque;
2152     NvmeNamespace *ns = req->ns;
2153     BlockBackend *blk = ns->blkconf.blk;
2154     BlockAcctCookie *acct = &req->acct;
2155     BlockAcctStats *stats = blk_get_stats(blk);
2156 
2157     trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2158 
2159     if (ret) {
2160         block_acct_failed(stats, acct);
2161         nvme_aio_err(req, ret);
2162     } else {
2163         block_acct_done(stats, acct);
2164     }
2165 
2166     if (ns->params.zoned && nvme_is_write(req)) {
2167         nvme_finalize_zoned_write(ns, req);
2168     }
2169 
2170     nvme_enqueue_req_completion(nvme_cq(req), req);
2171 }
2172 
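     /*
      * Completion of the data part of a read/write. If the namespace has
      * metadata, issue the metadata transfer (or zero it for Write Zeroes)
      * and complete the request from nvme_rw_complete_cb() instead.
      */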
2173 static void nvme_rw_cb(void *opaque, int ret)
2174 {
2175     NvmeRequest *req = opaque;
2176     NvmeNamespace *ns = req->ns;
2177 
2178     BlockBackend *blk = ns->blkconf.blk;
2179 
2180     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2181 
2182     if (ret) {
2183         goto out;
2184     }
2185 
2186     if (ns->lbaf.ms) {
2187         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2188         uint64_t slba = le64_to_cpu(rw->slba);
2189         uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2190         uint64_t offset = nvme_moff(ns, slba);
2191 
2192         if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2193             size_t mlen = nvme_m2b(ns, nlb);
2194 
2195             req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2196                                                BDRV_REQ_MAY_UNMAP,
2197                                                nvme_rw_complete_cb, req);
2198             return;
2199         }
2200 
2201         if (nvme_ns_ext(ns) || req->cmd.mptr) {
2202             uint16_t status;
2203 
2204             nvme_sg_unmap(&req->sg);
2205             status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2206             if (status) {
2207                 ret = -EFAULT;
2208                 goto out;
2209             }
2210 
2211             if (req->cmd.opcode == NVME_CMD_READ) {
2212                 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2213             }
2214 
2215             return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2216         }
2217     }
2218 
2219 out:
2220     nvme_rw_complete_cb(req, ret);
2221 }
2222 
2223 static void nvme_verify_cb(void *opaque, int ret)
2224 {
2225     NvmeBounceContext *ctx = opaque;
2226     NvmeRequest *req = ctx->req;
2227     NvmeNamespace *ns = req->ns;
2228     BlockBackend *blk = ns->blkconf.blk;
2229     BlockAcctCookie *acct = &req->acct;
2230     BlockAcctStats *stats = blk_get_stats(blk);
2231     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2232     uint64_t slba = le64_to_cpu(rw->slba);
2233     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2234     uint16_t apptag = le16_to_cpu(rw->apptag);
2235     uint16_t appmask = le16_to_cpu(rw->appmask);
2236     uint64_t reftag = le32_to_cpu(rw->reftag);
2237     uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2238     uint16_t status;
2239 
2240     reftag |= cdw3 << 32;
2241 
2242     trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2243 
2244     if (ret) {
2245         block_acct_failed(stats, acct);
2246         nvme_aio_err(req, ret);
2247         goto out;
2248     }
2249 
2250     block_acct_done(stats, acct);
2251 
2252     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2253         status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2254                                        ctx->mdata.iov.size, slba);
2255         if (status) {
2256             req->status = status;
2257             goto out;
2258         }
2259 
2260         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2261                                      ctx->mdata.bounce, ctx->mdata.iov.size,
2262                                      prinfo, slba, apptag, appmask, &reftag);
2263     }
2264 
2265 out:
2266     qemu_iovec_destroy(&ctx->data.iov);
2267     g_free(ctx->data.bounce);
2268 
2269     qemu_iovec_destroy(&ctx->mdata.iov);
2270     g_free(ctx->mdata.bounce);
2271 
2272     g_free(ctx);
2273 
2274     nvme_enqueue_req_completion(nvme_cq(req), req);
2275 }
2276 
2277 
2278 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2279 {
2280     NvmeBounceContext *ctx = opaque;
2281     NvmeRequest *req = ctx->req;
2282     NvmeNamespace *ns = req->ns;
2283     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2284     uint64_t slba = le64_to_cpu(rw->slba);
2285     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2286     size_t mlen = nvme_m2b(ns, nlb);
2287     uint64_t offset = nvme_moff(ns, slba);
2288     BlockBackend *blk = ns->blkconf.blk;
2289 
2290     trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2291 
2292     if (ret) {
2293         goto out;
2294     }
2295 
2296     ctx->mdata.bounce = g_malloc(mlen);
2297 
2298     qemu_iovec_reset(&ctx->mdata.iov);
2299     qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2300 
2301     req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2302                                 nvme_verify_cb, ctx);
2303     return;
2304 
2305 out:
2306     nvme_verify_cb(ctx, ret);
2307 }
2308 
2309 struct nvme_compare_ctx {
2310     struct {
2311         QEMUIOVector iov;
2312         uint8_t *bounce;
2313     } data;
2314 
2315     struct {
2316         QEMUIOVector iov;
2317         uint8_t *bounce;
2318     } mdata;
2319 };
2320 
2321 static void nvme_compare_mdata_cb(void *opaque, int ret)
2322 {
2323     NvmeRequest *req = opaque;
2324     NvmeNamespace *ns = req->ns;
2325     NvmeCtrl *n = nvme_ctrl(req);
2326     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2327     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2328     uint16_t apptag = le16_to_cpu(rw->apptag);
2329     uint16_t appmask = le16_to_cpu(rw->appmask);
2330     uint64_t reftag = le32_to_cpu(rw->reftag);
2331     uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2332     struct nvme_compare_ctx *ctx = req->opaque;
2333     g_autofree uint8_t *buf = NULL;
2334     BlockBackend *blk = ns->blkconf.blk;
2335     BlockAcctCookie *acct = &req->acct;
2336     BlockAcctStats *stats = blk_get_stats(blk);
2337     uint16_t status = NVME_SUCCESS;
2338 
2339     reftag |= cdw3 << 32;
2340 
2341     trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2342 
2343     if (ret) {
2344         block_acct_failed(stats, acct);
2345         nvme_aio_err(req, ret);
2346         goto out;
2347     }
2348 
2349     buf = g_malloc(ctx->mdata.iov.size);
2350 
2351     status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2352                                NVME_TX_DIRECTION_TO_DEVICE, req);
2353     if (status) {
2354         req->status = status;
2355         goto out;
2356     }
2357 
2358     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2359         uint64_t slba = le64_to_cpu(rw->slba);
2360         uint8_t *bufp;
2361         uint8_t *mbufp = ctx->mdata.bounce;
2362         uint8_t *end = mbufp + ctx->mdata.iov.size;
2363         int16_t pil = 0;
2364 
2365         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2366                                 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2367                                 slba, apptag, appmask, &reftag);
2368         if (status) {
2369             req->status = status;
2370             goto out;
2371         }
2372 
2373         /*
2374          * When formatted with protection information, do not compare the DIF
2375          * tuple.
2376          */
2377         if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2378             pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2379         }
2380 
2381         for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2382             if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2383                 req->status = NVME_CMP_FAILURE | NVME_DNR;
2384                 goto out;
2385             }
2386         }
2387 
2388         goto out;
2389     }
2390 
2391     if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2392         req->status = NVME_CMP_FAILURE | NVME_DNR;
2393         goto out;
2394     }
2395 
2396     block_acct_done(stats, acct);
2397 
2398 out:
2399     qemu_iovec_destroy(&ctx->data.iov);
2400     g_free(ctx->data.bounce);
2401 
2402     qemu_iovec_destroy(&ctx->mdata.iov);
2403     g_free(ctx->mdata.bounce);
2404 
2405     g_free(ctx);
2406 
2407     nvme_enqueue_req_completion(nvme_cq(req), req);
2408 }
2409 
2410 static void nvme_compare_data_cb(void *opaque, int ret)
2411 {
2412     NvmeRequest *req = opaque;
2413     NvmeCtrl *n = nvme_ctrl(req);
2414     NvmeNamespace *ns = req->ns;
2415     BlockBackend *blk = ns->blkconf.blk;
2416     BlockAcctCookie *acct = &req->acct;
2417     BlockAcctStats *stats = blk_get_stats(blk);
2418 
2419     struct nvme_compare_ctx *ctx = req->opaque;
2420     g_autofree uint8_t *buf = NULL;
2421     uint16_t status;
2422 
2423     trace_pci_nvme_compare_data_cb(nvme_cid(req));
2424 
2425     if (ret) {
2426         block_acct_failed(stats, acct);
2427         nvme_aio_err(req, ret);
2428         goto out;
2429     }
2430 
2431     buf = g_malloc(ctx->data.iov.size);
2432 
2433     status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2434                               NVME_TX_DIRECTION_TO_DEVICE, req);
2435     if (status) {
2436         req->status = status;
2437         goto out;
2438     }
2439 
2440     if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2441         req->status = NVME_CMP_FAILURE | NVME_DNR;
2442         goto out;
2443     }
2444 
2445     if (ns->lbaf.ms) {
2446         NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2447         uint64_t slba = le64_to_cpu(rw->slba);
2448         uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2449         size_t mlen = nvme_m2b(ns, nlb);
2450         uint64_t offset = nvme_moff(ns, slba);
2451 
2452         ctx->mdata.bounce = g_malloc(mlen);
2453 
2454         qemu_iovec_init(&ctx->mdata.iov, 1);
2455         qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2456 
2457         req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2458                                     nvme_compare_mdata_cb, req);
2459         return;
2460     }
2461 
2462     block_acct_done(stats, acct);
2463 
2464 out:
2465     qemu_iovec_destroy(&ctx->data.iov);
2466     g_free(ctx->data.bounce);
2467     g_free(ctx);
2468 
2469     nvme_enqueue_req_completion(nvme_cq(req), req);
2470 }
2471 
2472 typedef struct NvmeDSMAIOCB {
2473     BlockAIOCB common;
2474     BlockAIOCB *aiocb;
2475     NvmeRequest *req;
2476     int ret;
2477 
2478     NvmeDsmRange *range;
2479     unsigned int nr;
2480     unsigned int idx;
2481 } NvmeDSMAIOCB;
2482 
2483 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2484 {
2485     NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2486 
2487     /* break nvme_dsm_cb loop */
2488     iocb->idx = iocb->nr;
2489     iocb->ret = -ECANCELED;
2490 
2491     if (iocb->aiocb) {
2492         blk_aio_cancel_async(iocb->aiocb);
2493         iocb->aiocb = NULL;
2494     } else {
2495         /*
2496          * We only reach this if nvme_dsm_cancel() has already been called or
2497          * the command ran to completion.
2498          */
2499         assert(iocb->idx == iocb->nr);
2500     }
2501 }
2502 
2503 static const AIOCBInfo nvme_dsm_aiocb_info = {
2504     .aiocb_size   = sizeof(NvmeDSMAIOCB),
2505     .cancel_async = nvme_dsm_cancel,
2506 };
2507 
2508 static void nvme_dsm_cb(void *opaque, int ret);
2509 
2510 static void nvme_dsm_md_cb(void *opaque, int ret)
2511 {
2512     NvmeDSMAIOCB *iocb = opaque;
2513     NvmeRequest *req = iocb->req;
2514     NvmeNamespace *ns = req->ns;
2515     NvmeDsmRange *range;
2516     uint64_t slba;
2517     uint32_t nlb;
2518 
2519     if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2520         goto done;
2521     }
2522 
2523     range = &iocb->range[iocb->idx - 1];
2524     slba = le64_to_cpu(range->slba);
2525     nlb = le32_to_cpu(range->nlb);
2526 
2527     /*
2528      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2529      * the metadata.
2530      */
2531 
2532     ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2533     if (ret) {
2534         if (ret < 0) {
2535             goto done;
2536         }
2537 
2538         nvme_dsm_cb(iocb, 0);
2539         return;
2540     }
2541 
2542     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2543                                         nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2544                                         nvme_dsm_cb, iocb);
2545     return;
2546 
2547 done:
2548     nvme_dsm_cb(iocb, ret);
2549 }
2550 
2551 static void nvme_dsm_cb(void *opaque, int ret)
2552 {
2553     NvmeDSMAIOCB *iocb = opaque;
2554     NvmeRequest *req = iocb->req;
2555     NvmeCtrl *n = nvme_ctrl(req);
2556     NvmeNamespace *ns = req->ns;
2557     NvmeDsmRange *range;
2558     uint64_t slba;
2559     uint32_t nlb;
2560 
2561     if (iocb->ret < 0) {
2562         goto done;
2563     } else if (ret < 0) {
2564         iocb->ret = ret;
2565         goto done;
2566     }
2567 
2568 next:
2569     if (iocb->idx == iocb->nr) {
2570         goto done;
2571     }
2572 
2573     range = &iocb->range[iocb->idx++];
2574     slba = le64_to_cpu(range->slba);
2575     nlb = le32_to_cpu(range->nlb);
2576 
2577     trace_pci_nvme_dsm_deallocate(slba, nlb);
2578 
2579     if (nlb > n->dmrsl) {
2580         trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2581         goto next;
2582     }
2583 
2584     if (nvme_check_bounds(ns, slba, nlb)) {
2585         trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2586                                              ns->id_ns.nsze);
2587         goto next;
2588     }
2589 
2590     iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2591                                    nvme_l2b(ns, nlb),
2592                                    nvme_dsm_md_cb, iocb);
2593     return;
2594 
2595 done:
2596     iocb->aiocb = NULL;
2597     iocb->common.cb(iocb->common.opaque, iocb->ret);
2598     g_free(iocb->range);
2599     qemu_aio_unref(iocb);
2600 }
2601 
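     /*
      * Dataset Management. Only the Deallocate (AD) attribute is acted upon;
      * each range is discarded through the nvme_dsm_cb()/nvme_dsm_md_cb()
      * callback chain, which also zeroes the metadata of fully discarded
      * ranges.
      */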
2602 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2603 {
2604     NvmeNamespace *ns = req->ns;
2605     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2606     uint32_t attr = le32_to_cpu(dsm->attributes);
2607     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2608     uint16_t status = NVME_SUCCESS;
2609 
2610     trace_pci_nvme_dsm(nr, attr);
2611 
2612     if (attr & NVME_DSMGMT_AD) {
2613         NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2614                                          nvme_misc_cb, req);
2615 
2616         iocb->req = req;
2617         iocb->ret = 0;
2618         iocb->range = g_new(NvmeDsmRange, nr);
2619         iocb->nr = nr;
2620         iocb->idx = 0;
2621 
2622         status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2623                           req);
2624         if (status) {
2625             g_free(iocb->range);
2626             qemu_aio_unref(iocb);
2627 
2628             return status;
2629         }
2630 
2631         req->aiocb = &iocb->common;
2632         nvme_dsm_cb(iocb, 0);
2633 
2634         return NVME_NO_COMPLETE;
2635     }
2636 
2637     return status;
2638 }
2639 
2640 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2641 {
2642     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2643     NvmeNamespace *ns = req->ns;
2644     BlockBackend *blk = ns->blkconf.blk;
2645     uint64_t slba = le64_to_cpu(rw->slba);
2646     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2647     size_t len = nvme_l2b(ns, nlb);
2648     int64_t offset = nvme_l2b(ns, slba);
2649     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2650     uint32_t reftag = le32_to_cpu(rw->reftag);
2651     NvmeBounceContext *ctx = NULL;
2652     uint16_t status;
2653 
2654     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2655 
2656     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2657         status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2658         if (status) {
2659             return status;
2660         }
2661 
2662         if (prinfo & NVME_PRINFO_PRACT) {
2663             return NVME_INVALID_PROT_INFO | NVME_DNR;
2664         }
2665     }
2666 
2667     if (len > n->page_size << n->params.vsl) {
2668         return NVME_INVALID_FIELD | NVME_DNR;
2669     }
2670 
2671     status = nvme_check_bounds(ns, slba, nlb);
2672     if (status) {
2673         return status;
2674     }
2675 
2676     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2677         status = nvme_check_dulbe(ns, slba, nlb);
2678         if (status) {
2679             return status;
2680         }
2681     }
2682 
2683     ctx = g_new0(NvmeBounceContext, 1);
2684     ctx->req = req;
2685 
2686     ctx->data.bounce = g_malloc(len);
2687 
2688     qemu_iovec_init(&ctx->data.iov, 1);
2689     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2690 
2691     block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2692                      BLOCK_ACCT_READ);
2693 
2694     req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2695                                 nvme_verify_mdata_in_cb, ctx);
2696     return NVME_NO_COMPLETE;
2697 }
2698 
2699 typedef struct NvmeCopyAIOCB {
2700     BlockAIOCB common;
2701     BlockAIOCB *aiocb;
2702     NvmeRequest *req;
2703     NvmeCtrl *n;
2704     int ret;
2705 
2706     void *ranges;
2707     unsigned int format;
2708     int nr;
2709     int idx;
2710 
2711     uint8_t *bounce;
2712     QEMUIOVector iov;
2713     struct {
2714         BlockAcctCookie read;
2715         BlockAcctCookie write;
2716     } acct;
2717 
2718     uint64_t reftag;
2719     uint64_t slba;
2720 
2721     NvmeZone *zone;
2722     NvmeNamespace *sns;
2723     uint32_t tcl;
2724 } NvmeCopyAIOCB;
2725 
2726 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2727 {
2728     NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2729 
2730     iocb->ret = -ECANCELED;
2731 
2732     if (iocb->aiocb) {
2733         blk_aio_cancel_async(iocb->aiocb);
2734         iocb->aiocb = NULL;
2735     }
2736 }
2737 
2738 static const AIOCBInfo nvme_copy_aiocb_info = {
2739     .aiocb_size   = sizeof(NvmeCopyAIOCB),
2740     .cancel_async = nvme_copy_cancel,
2741 };
2742 
2743 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2744 {
2745     NvmeRequest *req = iocb->req;
2746     NvmeNamespace *ns = req->ns;
2747     BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2748 
2749     if (iocb->idx != iocb->nr) {
2750         req->cqe.result = cpu_to_le32(iocb->idx);
2751     }
2752 
2753     qemu_iovec_destroy(&iocb->iov);
2754     g_free(iocb->bounce);
2755 
2756     if (iocb->ret < 0) {
2757         block_acct_failed(stats, &iocb->acct.read);
2758         block_acct_failed(stats, &iocb->acct.write);
2759     } else {
2760         block_acct_done(stats, &iocb->acct.read);
2761         block_acct_done(stats, &iocb->acct.write);
2762     }
2763 
2764     iocb->common.cb(iocb->common.opaque, iocb->ret);
2765     qemu_aio_unref(iocb);
2766 }
2767 
2768 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2769 
2770 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2771                                                    int idx, uint64_t *slba,
2772                                                    uint32_t *nlb,
2773                                                    uint32_t *snsid,
2774                                                    uint16_t *apptag,
2775                                                    uint16_t *appmask,
2776                                                    uint64_t *reftag)
2777 {
2778     NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2779 
2780     if (snsid) {
2781         *snsid = le32_to_cpu(_ranges[idx].sparams);
2782     }
2783 
2784     if (slba) {
2785         *slba = le64_to_cpu(_ranges[idx].slba);
2786     }
2787 
2788     if (nlb) {
2789         *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2790     }
2791 
2792     if (apptag) {
2793         *apptag = le16_to_cpu(_ranges[idx].apptag);
2794     }
2795 
2796     if (appmask) {
2797         *appmask = le16_to_cpu(_ranges[idx].appmask);
2798     }
2799 
2800     if (reftag) {
2801         *reftag = le32_to_cpu(_ranges[idx].reftag);
2802     }
2803 }
2804 
2805 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2806                                                    uint64_t *slba,
2807                                                    uint32_t *nlb,
2808                                                    uint32_t *snsid,
2809                                                    uint16_t *apptag,
2810                                                    uint16_t *appmask,
2811                                                    uint64_t *reftag)
2812 {
2813     NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2814 
2815     if (snsid) {
2816         *snsid = le32_to_cpu(_ranges[idx].sparams);
2817     }
2818 
2819     if (slba) {
2820         *slba = le64_to_cpu(_ranges[idx].slba);
2821     }
2822 
2823     if (nlb) {
2824         *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2825     }
2826 
2827     if (apptag) {
2828         *apptag = le16_to_cpu(_ranges[idx].apptag);
2829     }
2830 
2831     if (appmask) {
2832         *appmask = le16_to_cpu(_ranges[idx].appmask);
2833     }
2834 
2835     if (reftag) {
2836         *reftag = 0;
2837 
2838         *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2839         *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2840         *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2841         *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2842         *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2843         *reftag |= (uint64_t)_ranges[idx].sr[9];
2844     }
2845 }
2846 
2847 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2848                                          uint64_t *slba, uint32_t *nlb,
2849                                          uint32_t *snsid, uint16_t *apptag,
2850                                          uint16_t *appmask, uint64_t *reftag)
2851 {
2852     switch (format) {
2853     case NVME_COPY_FORMAT_0:
2854     case NVME_COPY_FORMAT_2:
2855         nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2856                                                apptag, appmask, reftag);
2857         break;
2858 
2859     case NVME_COPY_FORMAT_1:
2860     case NVME_COPY_FORMAT_3:
2861         nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2862                                                apptag, appmask, reftag);
2863         break;
2864 
2865     default:
2866         abort();
2867     }
2868 }
2869 
2870 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2871                                            NvmeCopyAIOCB *iocb, uint16_t nr)
2872 {
2873     uint32_t copy_len = 0;
2874 
2875     for (int idx = 0; idx < nr; idx++) {
2876         uint32_t nlb;
2877         nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2878                                      &nlb, NULL, NULL, NULL, NULL);
2879         copy_len += nlb;
2880     }
2881     iocb->tcl = copy_len;
2882     if (copy_len > ns->id_ns.mcl) {
2883         return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2884     }
2885 
2886     return NVME_SUCCESS;
2887 }
2888 
2889 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2890 {
2891     NvmeCopyAIOCB *iocb = opaque;
2892     NvmeRequest *req = iocb->req;
2893     NvmeNamespace *dns = req->ns;
2894     uint32_t nlb;
2895 
2896     nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2897                                  &nlb, NULL, NULL, NULL, NULL);
2898 
2899     if (ret < 0) {
2900         iocb->ret = ret;
2901         goto out;
2902     } else if (iocb->ret < 0) {
2903         goto out;
2904     }
2905 
2906     if (dns->params.zoned) {
2907         nvme_advance_zone_wp(dns, iocb->zone, nlb);
2908     }
2909 
2910     iocb->idx++;
2911     iocb->slba += nlb;
2912 out:
2913     nvme_do_copy(iocb);
2914 }
2915 
2916 static void nvme_copy_out_cb(void *opaque, int ret)
2917 {
2918     NvmeCopyAIOCB *iocb = opaque;
2919     NvmeRequest *req = iocb->req;
2920     NvmeNamespace *dns = req->ns;
2921     uint32_t nlb;
2922     size_t mlen;
2923     uint8_t *mbounce;
2924 
2925     if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2926         goto out;
2927     }
2928 
2929     nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2930                                  &nlb, NULL, NULL, NULL, NULL);
2931 
2932     mlen = nvme_m2b(dns, nlb);
2933     mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2934 
2935     qemu_iovec_reset(&iocb->iov);
2936     qemu_iovec_add(&iocb->iov, mbounce, mlen);
2937 
2938     iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2939                                   &iocb->iov, 0, nvme_copy_out_completed_cb,
2940                                   iocb);
2941 
2942     return;
2943 
2944 out:
2945     nvme_copy_out_completed_cb(iocb, ret);
2946 }
2947 
2948 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2949 {
2950     NvmeCopyAIOCB *iocb = opaque;
2951     NvmeRequest *req = iocb->req;
2952     NvmeNamespace *sns = iocb->sns;
2953     NvmeNamespace *dns = req->ns;
2954     NvmeCopyCmd *copy = NULL;
2955     uint8_t *mbounce = NULL;
2956     uint32_t nlb;
2957     uint64_t slba;
2958     uint16_t apptag, appmask;
2959     uint64_t reftag;
2960     size_t len, mlen;
2961     uint16_t status;
2962 
2963     if (ret < 0) {
2964         iocb->ret = ret;
2965         goto out;
2966     } else if (iocb->ret < 0) {
2967         goto out;
2968     }
2969 
2970     nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2971                                  &nlb, NULL, &apptag, &appmask, &reftag);
2972 
2973     trace_pci_nvme_copy_out(iocb->slba, nlb);
2974 
2975     len = nvme_l2b(sns, nlb);
2976 
2977     if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
2978         copy = (NvmeCopyCmd *)&req->cmd;
2979 
2980         uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2981 
2982         mlen = nvme_m2b(sns, nlb);
2983         mbounce = iocb->bounce + nvme_l2b(sns, nlb);
2984 
2985         status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
2986         if (status) {
2987             goto invalid;
2988         }
2989         status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
2990                                 slba, apptag, appmask, &reftag);
2991         if (status) {
2992             goto invalid;
2993         }
2994     }
2995 
2996     if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
2997         copy = (NvmeCopyCmd *)&req->cmd;
2998         uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2999 
3000         mlen = nvme_m2b(dns, nlb);
3001         mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3002 
3003         apptag = le16_to_cpu(copy->apptag);
3004         appmask = le16_to_cpu(copy->appmask);
3005 
3006         if (prinfow & NVME_PRINFO_PRACT) {
3007             status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3008             if (status) {
3009                 goto invalid;
3010             }
3011 
3012             nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3013                                         apptag, &iocb->reftag);
3014         } else {
3015             status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3016                                     prinfow, iocb->slba, apptag, appmask,
3017                                     &iocb->reftag);
3018             if (status) {
3019                 goto invalid;
3020             }
3021         }
3022     }
3023 
3024     status = nvme_check_bounds(dns, iocb->slba, nlb);
3025     if (status) {
3026         goto invalid;
3027     }
3028 
3029     if (dns->params.zoned) {
3030         status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3031         if (status) {
3032             goto invalid;
3033         }
3034 
3035         if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3036             iocb->zone->w_ptr += nlb;
3037         }
3038     }
3039 
3040     qemu_iovec_reset(&iocb->iov);
3041     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3042 
3043     block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3044                      BLOCK_ACCT_WRITE);
3045 
3046     iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3047                                   &iocb->iov, 0, nvme_copy_out_cb, iocb);
3048 
3049     return;
3050 
3051 invalid:
3052     req->status = status;
3053     iocb->ret = -1;
3054 out:
3055     nvme_do_copy(iocb);
3056 }
3057 
3058 static void nvme_copy_in_cb(void *opaque, int ret)
3059 {
3060     NvmeCopyAIOCB *iocb = opaque;
3061     NvmeNamespace *sns = iocb->sns;
3062     uint64_t slba;
3063     uint32_t nlb;
3064 
3065     if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3066         goto out;
3067     }
3068 
3069     nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3070                                  &nlb, NULL, NULL, NULL, NULL);
3071 
3072     qemu_iovec_reset(&iocb->iov);
3073     qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3074                    nvme_m2b(sns, nlb));
3075 
3076     iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3077                                  &iocb->iov, 0, nvme_copy_in_completed_cb,
3078                                  iocb);
3079     return;
3080 
3081 out:
3082     nvme_copy_in_completed_cb(iocb, ret);
3083 }
3084 
3085 static inline bool nvme_csi_supports_copy(uint8_t csi)
3086 {
3087     return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3088 }
3089 
3090 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3091                                              NvmeNamespace *dns)
3092 {
3093     return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3094 }
3095 
3096 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3097                                          bool pi_enable)
3098 {
3099     if (!nvme_csi_supports_copy(sns->csi) ||
3100         !nvme_csi_supports_copy(dns->csi)) {
3101         return false;
3102     }
3103 
3104     if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3105         return false;
3106     }
3107 
3108     if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3109         sns->id_ns.dps != dns->id_ns.dps)) {
3110         return false;
3111     }
3112 
3113     return true;
3114 }
3115 
3116 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3117                                               NvmeNamespace *dns)
3118 {
3119     return sns->lbaf.ms == 0 &&
3120            ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3121            (dns->lbaf.ms == 16 && dns->pif == 1));
3122 }
3123 
3124 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3125                                         bool sns_pi_en)
3126 {
3127     if (!nvme_csi_supports_copy(sns->csi) ||
3128         !nvme_csi_supports_copy(dns->csi)) {
3129         return false;
3130     }
3131 
3132     if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3133         return false;
3134     }
3135 
3136     if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3137         return false;
3138     }
3139 
3140     return true;
3141 }
3142 
3143 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3144 {
3145     NvmeRequest *req = iocb->req;
3146     NvmeNamespace *sns;
3147     NvmeNamespace *dns = req->ns;
3148     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3149     uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3150     uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3151     uint64_t slba;
3152     uint32_t nlb;
3153     size_t len;
3154     uint16_t status;
3155     uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3156     uint32_t snsid = dnsid;
3157 
3158     if (iocb->ret < 0) {
3159         goto done;
3160     }
3161 
3162     if (iocb->idx == iocb->nr) {
3163         goto done;
3164     }
3165 
3166     if (iocb->format == 2 || iocb->format == 3) {
3167         nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3168                                      &slba, &nlb, &snsid, NULL, NULL, NULL);
3169         if (snsid != dnsid) {
3170             if (snsid == NVME_NSID_BROADCAST ||
3171                 !nvme_nsid_valid(iocb->n, snsid)) {
3172                 status = NVME_INVALID_NSID | NVME_DNR;
3173                 goto invalid;
3174             }
3175             iocb->sns = nvme_ns(iocb->n, snsid);
3176             if (unlikely(!iocb->sns)) {
3177                 status = NVME_INVALID_FIELD | NVME_DNR;
3178                 goto invalid;
3179             }
3180         } else {
3181             if (((slba + nlb) > iocb->slba) &&
3182                 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3183                 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3184                 goto invalid;
3185             }
3186         }
3187     } else {
3188         nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3189                                      &slba, &nlb, NULL, NULL, NULL, NULL);
3190     }
3191 
3192     sns = iocb->sns;
3193     if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3194         ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3195         status = NVME_INVALID_FIELD | NVME_DNR;
3196         goto invalid;
3197     } else if (snsid != dnsid) {
3198         if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3199             !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3200             if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3201                 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3202                 goto invalid;
3203             }
3204         }
3205         if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3206             NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3207             if ((prinfor & NVME_PRINFO_PRACT) !=
3208                 (prinfow & NVME_PRINFO_PRACT)) {
3209                 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3210                 goto invalid;
3211             } else {
3212                 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3213                     status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3214                     goto invalid;
3215                 }
3216             }
3217         }
3218 
3219         if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3220             NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3221             if (!(prinfow & NVME_PRINFO_PRACT)) {
3222                 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3223                 goto invalid;
3224             } else {
3225                 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3226                     status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3227                     goto invalid;
3228                 }
3229             }
3230         }
3231 
3232         if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3233             !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3234             if (!(prinfor & NVME_PRINFO_PRACT)) {
3235                 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3236                 goto invalid;
3237             } else {
3238                 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3239                     status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3240                     goto invalid;
3241                 }
3242             }
3243         }
3244     }
3245     len = nvme_l2b(sns, nlb);
3246 
3247     trace_pci_nvme_copy_source_range(slba, nlb);
3248 
3249     if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3250         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3251         goto invalid;
3252     }
3253 
3254     status = nvme_check_bounds(sns, slba, nlb);
3255     if (status) {
3256         goto invalid;
3257     }
3258 
3259     if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3260         status = nvme_check_dulbe(sns, slba, nlb);
3261         if (status) {
3262             goto invalid;
3263         }
3264     }
3265 
3266     if (sns->params.zoned) {
3267         status = nvme_check_zone_read(sns, slba, nlb);
3268         if (status) {
3269             goto invalid;
3270         }
3271     }
3272 
3273     g_free(iocb->bounce);
3274     iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3275                               sns->lbasz + sns->lbaf.ms);
3276 
3277     qemu_iovec_reset(&iocb->iov);
3278     qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3279 
3280     block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3281                      BLOCK_ACCT_READ);
3282 
3283     iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3284                                  &iocb->iov, 0, nvme_copy_in_cb, iocb);
3285     return;
3286 
3287 invalid:
3288     req->status = status;
3289     iocb->ret = -1;
3290 done:
3291     nvme_copy_done(iocb);
3292 }
3293 
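     /*
      * Copy command. Source ranges are copied one at a time through a bounce
      * buffer by the nvme_do_copy()/nvme_copy_in_cb()/nvme_copy_out_cb()
      * callback chain, applying per-range protection information and zone
      * checks along the way.
      */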
3294 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3295 {
3296     NvmeNamespace *ns = req->ns;
3297     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3298     NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3299                                       nvme_misc_cb, req);
3300     uint16_t nr = copy->nr + 1;
3301     uint8_t format = copy->control[0] & 0xf;
3302     size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3303 
3304     uint16_t status;
3305 
3306     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3307 
3308     iocb->ranges = NULL;
3309     iocb->zone = NULL;
3310 
3311     if (!(n->id_ctrl.ocfs & (1 << format)) ||
3312         ((format == 2 || format == 3) &&
3313          !(n->features.hbs.cdfe & (1 << format)))) {
3314         trace_pci_nvme_err_copy_invalid_format(format);
3315         status = NVME_INVALID_FIELD | NVME_DNR;
3316         goto invalid;
3317     }
3318 
3319     if (nr > ns->id_ns.msrc + 1) {
3320         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3321         goto invalid;
3322     }
3323 
3324     if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3325         (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3326         status = NVME_INVALID_FORMAT | NVME_DNR;
3327         goto invalid;
3328     }
3329 
3330     if (ns->pif) {
3331         len = sizeof(NvmeCopySourceRangeFormat1_3);
3332     }
3333 
3334     iocb->format = format;
3335     iocb->ranges = g_malloc_n(nr, len);
3336     status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3337     if (status) {
3338         goto invalid;
3339     }
3340 
3341     iocb->slba = le64_to_cpu(copy->sdlba);
3342 
3343     if (ns->params.zoned) {
3344         iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3345         if (!iocb->zone) {
3346             status = NVME_LBA_RANGE | NVME_DNR;
3347             goto invalid;
3348         }
3349 
3350         status = nvme_zrm_auto(n, ns, iocb->zone);
3351         if (status) {
3352             goto invalid;
3353         }
3354     }
3355 
3356     status = nvme_check_copy_mcl(ns, iocb, nr);
3357     if (status) {
3358         goto invalid;
3359     }
3360 
3361     iocb->req = req;
3362     iocb->ret = 0;
3363     iocb->nr = nr;
3364     iocb->idx = 0;
3365     iocb->reftag = le32_to_cpu(copy->reftag);
3366     iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3367 
3368     qemu_iovec_init(&iocb->iov, 1);
3369 
3370     req->aiocb = &iocb->common;
3371     iocb->sns = req->ns;
3372     iocb->n = n;
3373     iocb->bounce = NULL;
3374     nvme_do_copy(iocb);
3375 
3376     return NVME_NO_COMPLETE;
3377 
3378 invalid:
3379     g_free(iocb->ranges);
3380     qemu_aio_unref(iocb);
3381     return status;
3382 }
3383 
3384 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3385 {
3386     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3387     NvmeNamespace *ns = req->ns;
3388     BlockBackend *blk = ns->blkconf.blk;
3389     uint64_t slba = le64_to_cpu(rw->slba);
3390     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3391     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3392     size_t data_len = nvme_l2b(ns, nlb);
3393     size_t len = data_len;
3394     int64_t offset = nvme_l2b(ns, slba);
3395     struct nvme_compare_ctx *ctx = NULL;
3396     uint16_t status;
3397 
3398     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3399 
3400     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3401         return NVME_INVALID_PROT_INFO | NVME_DNR;
3402     }
3403 
3404     if (nvme_ns_ext(ns)) {
3405         len += nvme_m2b(ns, nlb);
3406     }
3407 
3408     status = nvme_check_mdts(n, len);
3409     if (status) {
3410         return status;
3411     }
3412 
3413     status = nvme_check_bounds(ns, slba, nlb);
3414     if (status) {
3415         return status;
3416     }
3417 
3418     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3419         status = nvme_check_dulbe(ns, slba, nlb);
3420         if (status) {
3421             return status;
3422         }
3423     }
3424 
3425     status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3426     if (status) {
3427         return status;
3428     }
3429 
3430     ctx = g_new(struct nvme_compare_ctx, 1);
3431     ctx->data.bounce = g_malloc(data_len);
3432 
3433     req->opaque = ctx;
3434 
3435     qemu_iovec_init(&ctx->data.iov, 1);
3436     qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3437 
3438     block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3439                      BLOCK_ACCT_READ);
3440     req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3441                                 nvme_compare_data_cb, req);
3442 
3443     return NVME_NO_COMPLETE;
3444 }
3445 
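/*
 * Flush is implemented as a small asynchronous state machine: nvme_flush()
 * sets up an NvmeFlushAIOCB and nvme_do_flush()/nvme_flush_ns_cb() walk the
 * target namespace(s), issuing one blk_aio_flush() at a time (all attached
 * namespaces for the broadcast NSID).
 */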
3446 typedef struct NvmeFlushAIOCB {
3447     BlockAIOCB common;
3448     BlockAIOCB *aiocb;
3449     NvmeRequest *req;
3450     int ret;
3451 
3452     NvmeNamespace *ns;
3453     uint32_t nsid;
3454     bool broadcast;
3455 } NvmeFlushAIOCB;
3456 
3457 static void nvme_flush_cancel(BlockAIOCB *acb)
3458 {
3459     NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3460 
3461     iocb->ret = -ECANCELED;
3462 
3463     if (iocb->aiocb) {
3464         blk_aio_cancel_async(iocb->aiocb);
3465         iocb->aiocb = NULL;
3466     }
3467 }
3468 
3469 static const AIOCBInfo nvme_flush_aiocb_info = {
3470     .aiocb_size = sizeof(NvmeFlushAIOCB),
3471     .cancel_async = nvme_flush_cancel,
3472 };
3473 
3474 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3475 
3476 static void nvme_flush_ns_cb(void *opaque, int ret)
3477 {
3478     NvmeFlushAIOCB *iocb = opaque;
3479     NvmeNamespace *ns = iocb->ns;
3480 
3481     if (ret < 0) {
3482         iocb->ret = ret;
3483         goto out;
3484     } else if (iocb->ret < 0) {
3485         goto out;
3486     }
3487 
3488     if (ns) {
3489         trace_pci_nvme_flush_ns(iocb->nsid);
3490 
3491         iocb->ns = NULL;
3492         iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3493         return;
3494     }
3495 
3496 out:
3497     nvme_do_flush(iocb);
3498 }
3499 
3500 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3501 {
3502     NvmeRequest *req = iocb->req;
3503     NvmeCtrl *n = nvme_ctrl(req);
3504     int i;
3505 
3506     if (iocb->ret < 0) {
3507         goto done;
3508     }
3509 
3510     if (iocb->broadcast) {
3511         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3512             iocb->ns = nvme_ns(n, i);
3513             if (iocb->ns) {
3514                 iocb->nsid = i;
3515                 break;
3516             }
3517         }
3518     }
3519 
3520     if (!iocb->ns) {
3521         goto done;
3522     }
3523 
3524     nvme_flush_ns_cb(iocb, 0);
3525     return;
3526 
3527 done:
3528     iocb->common.cb(iocb->common.opaque, iocb->ret);
3529     qemu_aio_unref(iocb);
3530 }
3531 
3532 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3533 {
3534     NvmeFlushAIOCB *iocb;
3535     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3536     uint16_t status;
3537 
3538     iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3539 
3540     iocb->req = req;
3541     iocb->ret = 0;
3542     iocb->ns = NULL;
3543     iocb->nsid = 0;
3544     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3545 
3546     if (!iocb->broadcast) {
3547         if (!nvme_nsid_valid(n, nsid)) {
3548             status = NVME_INVALID_NSID | NVME_DNR;
3549             goto out;
3550         }
3551 
3552         iocb->ns = nvme_ns(n, nsid);
3553         if (!iocb->ns) {
3554             status = NVME_INVALID_FIELD | NVME_DNR;
3555             goto out;
3556         }
3557 
3558         iocb->nsid = nsid;
3559     }
3560 
3561     req->aiocb = &iocb->common;
3562     nvme_do_flush(iocb);
3563 
3564     return NVME_NO_COMPLETE;
3565 
3566 out:
3567     qemu_aio_unref(iocb);
3568 
3569     return status;
3570 }
3571 
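/*
 * Read command: check MDTS, LBA bounds, zone read rules and DULBE before
 * mapping the host buffer and submitting the read to the block backend.
 * Namespaces formatted with end-to-end protection are handled by
 * nvme_dif_rw() instead.
 */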
3572 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3573 {
3574     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3575     NvmeNamespace *ns = req->ns;
3576     uint64_t slba = le64_to_cpu(rw->slba);
3577     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3578     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3579     uint64_t data_size = nvme_l2b(ns, nlb);
3580     uint64_t mapped_size = data_size;
3581     uint64_t data_offset;
3582     BlockBackend *blk = ns->blkconf.blk;
3583     uint16_t status;
3584 
3585     if (nvme_ns_ext(ns)) {
3586         mapped_size += nvme_m2b(ns, nlb);
3587 
3588         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3589             bool pract = prinfo & NVME_PRINFO_PRACT;
3590 
3591             if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3592                 mapped_size = data_size;
3593             }
3594         }
3595     }
3596 
3597     trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3598 
3599     status = nvme_check_mdts(n, mapped_size);
3600     if (status) {
3601         goto invalid;
3602     }
3603 
3604     status = nvme_check_bounds(ns, slba, nlb);
3605     if (status) {
3606         goto invalid;
3607     }
3608 
3609     if (ns->params.zoned) {
3610         status = nvme_check_zone_read(ns, slba, nlb);
3611         if (status) {
3612             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3613             goto invalid;
3614         }
3615     }
3616 
3617     if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3618         status = nvme_check_dulbe(ns, slba, nlb);
3619         if (status) {
3620             goto invalid;
3621         }
3622     }
3623 
3624     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3625         return nvme_dif_rw(n, req);
3626     }
3627 
3628     status = nvme_map_data(n, nlb, req);
3629     if (status) {
3630         goto invalid;
3631     }
3632 
3633     data_offset = nvme_l2b(ns, slba);
3634 
3635     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3636                      BLOCK_ACCT_READ);
3637     nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3638     return NVME_NO_COMPLETE;
3639 
3640 invalid:
3641     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3642     return status | NVME_DNR;
3643 }
3644 
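/*
 * Account a write against the Flexible Data Placement reclaim unit selected
 * by the placement identifier in DSPEC (falling back to placement handle 0
 * if the directive type or identifier is invalid), moving on to the next
 * reclaim unit whenever the current one runs out of available media writes.
 */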
3645 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3646                               uint32_t nlb)
3647 {
3648     NvmeNamespace *ns = req->ns;
3649     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3650     uint64_t data_size = nvme_l2b(ns, nlb);
3651     uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3652     uint8_t dtype = (dw12 >> 20) & 0xf;
3653     uint16_t pid = le16_to_cpu(rw->dspec);
3654     uint16_t ph, rg, ruhid;
3655     NvmeReclaimUnit *ru;
3656 
3657     if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3658         !nvme_parse_pid(ns, pid, &ph, &rg)) {
3659         ph = 0;
3660         rg = 0;
3661     }
3662 
3663     ruhid = ns->fdp.phs[ph];
3664     ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3665 
3666     nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3667     nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3668 
3669     while (nlb) {
3670         if (nlb < ru->ruamw) {
3671             ru->ruamw -= nlb;
3672             break;
3673         }
3674 
3675         nlb -= ru->ruamw;
3676         nvme_update_ruh(n, ns, pid);
3677     }
3678 }
3679 
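/*
 * Common write path for Write, Write Zeroes (wrz) and Zone Append (append).
 * For zoned namespaces this enforces the zone write rules, remaps the SLBA
 * (and reference tag) for appends and advances the write pointer; for
 * FDP-enabled endurance groups it updates the reclaim unit accounting before
 * the I/O is issued.
 */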
3680 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3681                               bool wrz)
3682 {
3683     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3684     NvmeNamespace *ns = req->ns;
3685     uint64_t slba = le64_to_cpu(rw->slba);
3686     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3687     uint16_t ctrl = le16_to_cpu(rw->control);
3688     uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3689     uint64_t data_size = nvme_l2b(ns, nlb);
3690     uint64_t mapped_size = data_size;
3691     uint64_t data_offset;
3692     NvmeZone *zone;
3693     NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3694     BlockBackend *blk = ns->blkconf.blk;
3695     uint16_t status;
3696 
3697     if (nvme_ns_ext(ns)) {
3698         mapped_size += nvme_m2b(ns, nlb);
3699 
3700         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3701             bool pract = prinfo & NVME_PRINFO_PRACT;
3702 
3703             if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3704                 mapped_size -= nvme_m2b(ns, nlb);
3705             }
3706         }
3707     }
3708 
3709     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3710                          nvme_nsid(ns), nlb, mapped_size, slba);
3711 
3712     if (!wrz) {
3713         status = nvme_check_mdts(n, mapped_size);
3714         if (status) {
3715             goto invalid;
3716         }
3717     }
3718 
3719     status = nvme_check_bounds(ns, slba, nlb);
3720     if (status) {
3721         goto invalid;
3722     }
3723 
3724     if (ns->params.zoned) {
3725         zone = nvme_get_zone_by_slba(ns, slba);
3726         assert(zone);
3727 
3728         if (append) {
3729             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3730 
3731             if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3732                 return NVME_INVALID_ZONE_OP | NVME_DNR;
3733             }
3734 
3735             if (unlikely(slba != zone->d.zslba)) {
3736                 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3737                 status = NVME_INVALID_FIELD;
3738                 goto invalid;
3739             }
3740 
3741             if (n->params.zasl &&
3742                 data_size > (uint64_t)n->page_size << n->params.zasl) {
3743                 trace_pci_nvme_err_zasl(data_size);
3744                 return NVME_INVALID_FIELD | NVME_DNR;
3745             }
3746 
3747             slba = zone->w_ptr;
3748             rw->slba = cpu_to_le64(slba);
3749             res->slba = cpu_to_le64(slba);
3750 
3751             switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3752             case NVME_ID_NS_DPS_TYPE_1:
3753                 if (!piremap) {
3754                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3755                 }
3756 
3757                 /* fallthrough */
3758 
3759             case NVME_ID_NS_DPS_TYPE_2:
3760                 if (piremap) {
3761                     uint32_t reftag = le32_to_cpu(rw->reftag);
3762                     rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3763                 }
3764 
3765                 break;
3766 
3767             case NVME_ID_NS_DPS_TYPE_3:
3768                 if (piremap) {
3769                     return NVME_INVALID_PROT_INFO | NVME_DNR;
3770                 }
3771 
3772                 break;
3773             }
3774         }
3775 
3776         status = nvme_check_zone_write(ns, zone, slba, nlb);
3777         if (status) {
3778             goto invalid;
3779         }
3780 
3781         status = nvme_zrm_auto(n, ns, zone);
3782         if (status) {
3783             goto invalid;
3784         }
3785 
3786         if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3787             zone->w_ptr += nlb;
3788         }
3789     } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3790         nvme_do_write_fdp(n, req, slba, nlb);
3791     }
3792 
3793     data_offset = nvme_l2b(ns, slba);
3794 
3795     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3796         return nvme_dif_rw(n, req);
3797     }
3798 
3799     if (!wrz) {
3800         status = nvme_map_data(n, nlb, req);
3801         if (status) {
3802             goto invalid;
3803         }
3804 
3805         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3806                          BLOCK_ACCT_WRITE);
3807         nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3808     } else {
3809         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3810                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3811                                            req);
3812     }
3813 
3814     return NVME_NO_COMPLETE;
3815 
3816 invalid:
3817     block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3818     return status | NVME_DNR;
3819 }
3820 
3821 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3822 {
3823     return nvme_do_write(n, req, false, false);
3824 }
3825 
3826 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3827 {
3828     return nvme_do_write(n, req, false, true);
3829 }
3830 
3831 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3832 {
3833     return nvme_do_write(n, req, true, false);
3834 }
3835 
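/*
 * Extract the zone SLBA from CDW10/CDW11 of a zone management command,
 * validate it against the namespace size and translate it into an index into
 * the zone array.
 */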
3836 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3837                                             uint64_t *slba, uint32_t *zone_idx)
3838 {
3839     uint32_t dw10 = le32_to_cpu(c->cdw10);
3840     uint32_t dw11 = le32_to_cpu(c->cdw11);
3841 
3842     if (!ns->params.zoned) {
3843         trace_pci_nvme_err_invalid_opc(c->opcode);
3844         return NVME_INVALID_OPCODE | NVME_DNR;
3845     }
3846 
3847     *slba = ((uint64_t)dw11) << 32 | dw10;
3848     if (unlikely(*slba >= ns->id_ns.nsze)) {
3849         trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3850         *slba = 0;
3851         return NVME_LBA_RANGE | NVME_DNR;
3852     }
3853 
3854     *zone_idx = nvme_zone_idx(ns, *slba);
3855     assert(*zone_idx < ns->num_zones);
3856 
3857     return NVME_SUCCESS;
3858 }
3859 
3860 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3861                                  NvmeRequest *);
3862 
3863 enum NvmeZoneProcessingMask {
3864     NVME_PROC_CURRENT_ZONE    = 0,
3865     NVME_PROC_OPENED_ZONES    = 1 << 0,
3866     NVME_PROC_CLOSED_ZONES    = 1 << 1,
3867     NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3868     NVME_PROC_FULL_ZONES      = 1 << 3,
3869 };
3870 
3871 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3872                                NvmeZoneState state, NvmeRequest *req)
3873 {
3874     NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3875     int flags = 0;
3876 
3877     if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3878         uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3879 
3880         if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3881             return NVME_INVALID_ZONE_OP | NVME_DNR;
3882         }
3883 
3884         if (zone->w_ptr % ns->zns.zrwafg) {
3885             return NVME_NOZRWA | NVME_DNR;
3886         }
3887 
3888         flags = NVME_ZRM_ZRWA;
3889     }
3890 
3891     return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3892 }
3893 
3894 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3895                                 NvmeZoneState state, NvmeRequest *req)
3896 {
3897     return nvme_zrm_close(ns, zone);
3898 }
3899 
3900 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3901                                  NvmeZoneState state, NvmeRequest *req)
3902 {
3903     return nvme_zrm_finish(ns, zone);
3904 }
3905 
3906 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3907                                   NvmeZoneState state, NvmeRequest *req)
3908 {
3909     switch (state) {
3910     case NVME_ZONE_STATE_READ_ONLY:
3911         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3912         /* fall through */
3913     case NVME_ZONE_STATE_OFFLINE:
3914         return NVME_SUCCESS;
3915     default:
3916         return NVME_ZONE_INVAL_TRANSITION;
3917     }
3918 }
3919 
3920 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3921 {
3922     uint16_t status;
3923     uint8_t state = nvme_get_zone_state(zone);
3924 
3925     if (state == NVME_ZONE_STATE_EMPTY) {
3926         status = nvme_aor_check(ns, 1, 0);
3927         if (status) {
3928             return status;
3929         }
3930         nvme_aor_inc_active(ns);
3931         zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3932         nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3933         return NVME_SUCCESS;
3934     }
3935 
3936     return NVME_ZONE_INVAL_TRANSITION;
3937 }
3938 
3939 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3940                                     enum NvmeZoneProcessingMask proc_mask,
3941                                     op_handler_t op_hndlr, NvmeRequest *req)
3942 {
3943     uint16_t status = NVME_SUCCESS;
3944     NvmeZoneState zs = nvme_get_zone_state(zone);
3945     bool proc_zone;
3946 
3947     switch (zs) {
3948     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3949     case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3950         proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3951         break;
3952     case NVME_ZONE_STATE_CLOSED:
3953         proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3954         break;
3955     case NVME_ZONE_STATE_READ_ONLY:
3956         proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3957         break;
3958     case NVME_ZONE_STATE_FULL:
3959         proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3960         break;
3961     default:
3962         proc_zone = false;
3963     }
3964 
3965     if (proc_zone) {
3966         status = op_hndlr(ns, zone, zs, req);
3967     }
3968 
3969     return status;
3970 }
3971 
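/*
 * Apply a zone operation either to the single zone selected by the command
 * (empty proc_mask) or, for "Select All", to every zone in the states
 * selected by proc_mask, using the per-state zone lists where available.
 */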
3972 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3973                                 enum NvmeZoneProcessingMask proc_mask,
3974                                 op_handler_t op_hndlr, NvmeRequest *req)
3975 {
3976     NvmeZone *next;
3977     uint16_t status = NVME_SUCCESS;
3978     int i;
3979 
3980     if (!proc_mask) {
3981         status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3982     } else {
3983         if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3984             QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3985                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3986                                              req);
3987                 if (status && status != NVME_NO_COMPLETE) {
3988                     goto out;
3989                 }
3990             }
3991         }
3992         if (proc_mask & NVME_PROC_OPENED_ZONES) {
3993             QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3994                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3995                                              req);
3996                 if (status && status != NVME_NO_COMPLETE) {
3997                     goto out;
3998                 }
3999             }
4000 
4001             QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4002                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4003                                              req);
4004                 if (status && status != NVME_NO_COMPLETE) {
4005                     goto out;
4006                 }
4007             }
4008         }
4009         if (proc_mask & NVME_PROC_FULL_ZONES) {
4010             QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4011                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4012                                              req);
4013                 if (status && status != NVME_NO_COMPLETE) {
4014                     goto out;
4015                 }
4016             }
4017         }
4018 
4019         if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4020             for (i = 0; i < ns->num_zones; i++, zone++) {
4021                 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4022                                              req);
4023                 if (status && status != NVME_NO_COMPLETE) {
4024                     goto out;
4025                 }
4026             }
4027         }
4028     }
4029 
4030 out:
4031     return status;
4032 }
4033 
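/*
 * Zone Management Send (Reset Zone) is asynchronous: nvme_zone_reset_cb()
 * walks the zone array, zeroing the data of each selected zone with
 * blk_aio_pwrite_zeroes() (BDRV_REQ_MAY_UNMAP) and, if the format carries
 * metadata, zeroing the metadata region in nvme_zone_reset_epilogue_cb().
 */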
4034 typedef struct NvmeZoneResetAIOCB {
4035     BlockAIOCB common;
4036     BlockAIOCB *aiocb;
4037     NvmeRequest *req;
4038     int ret;
4039 
4040     bool all;
4041     int idx;
4042     NvmeZone *zone;
4043 } NvmeZoneResetAIOCB;
4044 
4045 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4046 {
4047     NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4048     NvmeRequest *req = iocb->req;
4049     NvmeNamespace *ns = req->ns;
4050 
4051     iocb->idx = ns->num_zones;
4052 
4053     iocb->ret = -ECANCELED;
4054 
4055     if (iocb->aiocb) {
4056         blk_aio_cancel_async(iocb->aiocb);
4057         iocb->aiocb = NULL;
4058     }
4059 }
4060 
4061 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4062     .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4063     .cancel_async = nvme_zone_reset_cancel,
4064 };
4065 
4066 static void nvme_zone_reset_cb(void *opaque, int ret);
4067 
4068 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4069 {
4070     NvmeZoneResetAIOCB *iocb = opaque;
4071     NvmeRequest *req = iocb->req;
4072     NvmeNamespace *ns = req->ns;
4073     int64_t moff;
4074     int count;
4075 
4076     if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4077         goto out;
4078     }
4079 
4080     moff = nvme_moff(ns, iocb->zone->d.zslba);
4081     count = nvme_m2b(ns, ns->zone_size);
4082 
4083     iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4084                                         BDRV_REQ_MAY_UNMAP,
4085                                         nvme_zone_reset_cb, iocb);
4086     return;
4087 
4088 out:
4089     nvme_zone_reset_cb(iocb, ret);
4090 }
4091 
4092 static void nvme_zone_reset_cb(void *opaque, int ret)
4093 {
4094     NvmeZoneResetAIOCB *iocb = opaque;
4095     NvmeRequest *req = iocb->req;
4096     NvmeNamespace *ns = req->ns;
4097 
4098     if (iocb->ret < 0) {
4099         goto done;
4100     } else if (ret < 0) {
4101         iocb->ret = ret;
4102         goto done;
4103     }
4104 
4105     if (iocb->zone) {
4106         nvme_zrm_reset(ns, iocb->zone);
4107 
4108         if (!iocb->all) {
4109             goto done;
4110         }
4111     }
4112 
4113     while (iocb->idx < ns->num_zones) {
4114         NvmeZone *zone = &ns->zone_array[iocb->idx++];
4115 
4116         switch (nvme_get_zone_state(zone)) {
4117         case NVME_ZONE_STATE_EMPTY:
4118             if (!iocb->all) {
4119                 goto done;
4120             }
4121 
4122             continue;
4123 
4124         case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4125         case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4126         case NVME_ZONE_STATE_CLOSED:
4127         case NVME_ZONE_STATE_FULL:
4128             iocb->zone = zone;
4129             break;
4130 
4131         default:
4132             continue;
4133         }
4134 
4135         trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4136 
4137         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4138                                             nvme_l2b(ns, zone->d.zslba),
4139                                             nvme_l2b(ns, ns->zone_size),
4140                                             BDRV_REQ_MAY_UNMAP,
4141                                             nvme_zone_reset_epilogue_cb,
4142                                             iocb);
4143         return;
4144     }
4145 
4146 done:
4147     iocb->aiocb = NULL;
4148 
4149     iocb->common.cb(iocb->common.opaque, iocb->ret);
4150     qemu_aio_unref(iocb);
4151 }
4152 
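/*
 * Explicitly commit (flush) a Zone Random Write Area up to and including the
 * given LBA; the flushed region must be a multiple of the ZRWA flush
 * granularity and the zone write pointer is advanced accordingly.
 */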
4153 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4154                                                uint64_t elba, NvmeRequest *req)
4155 {
4156     NvmeNamespace *ns = req->ns;
4157     uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4158     uint64_t wp = zone->d.wp;
4159     uint32_t nlb = elba - wp + 1;
4160     uint16_t status;
4161 
4162 
4164         return NVME_INVALID_ZONE_OP | NVME_DNR;
4165     }
4166 
4167     if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4168         return NVME_INVALID_FIELD | NVME_DNR;
4169     }
4170 
4171     if (elba < wp || elba > wp + ns->zns.zrwas) {
4172         return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4173     }
4174 
4175     if (nlb % ns->zns.zrwafg) {
4176         return NVME_INVALID_FIELD | NVME_DNR;
4177     }
4178 
4179     status = nvme_zrm_auto(n, ns, zone);
4180     if (status) {
4181         return status;
4182     }
4183 
4184     zone->w_ptr += nlb;
4185 
4186     nvme_advance_zone_wp(ns, zone, nlb);
4187 
4188     return NVME_SUCCESS;
4189 }
4190 
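/*
 * Zone Management Send: dispatch the Open, Close, Finish, Reset, Offline,
 * Set Zone Descriptor Extension and ZRWA Flush actions, honouring the
 * "Select All" flag where permitted.
 */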
4191 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4192 {
4193     NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4194     NvmeNamespace *ns = req->ns;
4195     NvmeZone *zone;
4196     NvmeZoneResetAIOCB *iocb;
4197     uint8_t *zd_ext;
4198     uint64_t slba = 0;
4199     uint32_t zone_idx = 0;
4200     uint16_t status;
4201     uint8_t action = cmd->zsa;
4202     bool all;
4203     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4204 
4205     all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4206 
4207     req->status = NVME_SUCCESS;
4208 
4209     if (!all) {
4210         status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4211         if (status) {
4212             return status;
4213         }
4214     }
4215 
4216     zone = &ns->zone_array[zone_idx];
4217     if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4218         trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4219         return NVME_INVALID_FIELD | NVME_DNR;
4220     }
4221 
4222     switch (action) {
4223 
4224     case NVME_ZONE_ACTION_OPEN:
4225         if (all) {
4226             proc_mask = NVME_PROC_CLOSED_ZONES;
4227         }
4228         trace_pci_nvme_open_zone(slba, zone_idx, all);
4229         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4230         break;
4231 
4232     case NVME_ZONE_ACTION_CLOSE:
4233         if (all) {
4234             proc_mask = NVME_PROC_OPENED_ZONES;
4235         }
4236         trace_pci_nvme_close_zone(slba, zone_idx, all);
4237         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4238         break;
4239 
4240     case NVME_ZONE_ACTION_FINISH:
4241         if (all) {
4242             proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4243         }
4244         trace_pci_nvme_finish_zone(slba, zone_idx, all);
4245         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4246         break;
4247 
4248     case NVME_ZONE_ACTION_RESET:
4249         trace_pci_nvme_reset_zone(slba, zone_idx, all);
4250 
4251         iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4252                            nvme_misc_cb, req);
4253 
4254         iocb->req = req;
4255         iocb->ret = 0;
4256         iocb->all = all;
4257         iocb->idx = zone_idx;
4258         iocb->zone = NULL;
4259 
4260         req->aiocb = &iocb->common;
4261         nvme_zone_reset_cb(iocb, 0);
4262 
4263         return NVME_NO_COMPLETE;
4264 
4265     case NVME_ZONE_ACTION_OFFLINE:
4266         if (all) {
4267             proc_mask = NVME_PROC_READ_ONLY_ZONES;
4268         }
4269         trace_pci_nvme_offline_zone(slba, zone_idx, all);
4270         status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4271         break;
4272 
4273     case NVME_ZONE_ACTION_SET_ZD_EXT:
4274         trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4275         if (all || !ns->params.zd_extension_size) {
4276             return NVME_INVALID_FIELD | NVME_DNR;
4277         }
4278         zd_ext = nvme_get_zd_extension(ns, zone_idx);
4279         status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4280         if (status) {
4281             trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4282             return status;
4283         }
4284 
4285         status = nvme_set_zd_ext(ns, zone);
4286         if (status == NVME_SUCCESS) {
4287             trace_pci_nvme_zd_extension_set(zone_idx);
4288             return status;
4289         }
4290         break;
4291 
4292     case NVME_ZONE_ACTION_ZRWA_FLUSH:
4293         if (all) {
4294             return NVME_INVALID_FIELD | NVME_DNR;
4295         }
4296 
4297         return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4298 
4299     default:
4300         trace_pci_nvme_err_invalid_mgmt_action(action);
4301         status = NVME_INVALID_FIELD;
4302     }
4303 
4304     if (status == NVME_ZONE_INVAL_TRANSITION) {
4305         trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4306                                                          zone->d.za);
4307     }
4308     if (status) {
4309         status |= NVME_DNR;
4310     }
4311 
4312     return status;
4313 }
4314 
4315 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4316 {
4317     NvmeZoneState zs = nvme_get_zone_state(zl);
4318 
4319     switch (zafs) {
4320     case NVME_ZONE_REPORT_ALL:
4321         return true;
4322     case NVME_ZONE_REPORT_EMPTY:
4323         return zs == NVME_ZONE_STATE_EMPTY;
4324     case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4325         return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4326     case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4327         return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4328     case NVME_ZONE_REPORT_CLOSED:
4329         return zs == NVME_ZONE_STATE_CLOSED;
4330     case NVME_ZONE_REPORT_FULL:
4331         return zs == NVME_ZONE_STATE_FULL;
4332     case NVME_ZONE_REPORT_READ_ONLY:
4333         return zs == NVME_ZONE_STATE_READ_ONLY;
4334     case NVME_ZONE_REPORT_OFFLINE:
4335         return zs == NVME_ZONE_STATE_OFFLINE;
4336     default:
4337         return false;
4338     }
4339 }
4340 
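/*
 * Zone Management Receive: build a zone report (optionally including zone
 * descriptor extensions), starting at the zone containing the given SLBA and
 * filtered by the reporting options in CDW13.
 */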
4341 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4342 {
4343     NvmeCmd *cmd = &req->cmd;
4344     NvmeNamespace *ns = req->ns;
4345     /* cdw12 is the zero-based number of dwords to return; convert it to bytes */
4346     uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4347     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4348     uint32_t zone_idx, zra, zrasf, partial;
4349     uint64_t max_zones, nr_zones = 0;
4350     uint16_t status;
4351     uint64_t slba;
4352     NvmeZoneDescr *z;
4353     NvmeZone *zone;
4354     NvmeZoneReportHeader *header;
4355     void *buf, *buf_p;
4356     size_t zone_entry_sz;
4357     int i;
4358 
4359     req->status = NVME_SUCCESS;
4360 
4361     status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4362     if (status) {
4363         return status;
4364     }
4365 
4366     zra = dw13 & 0xff;
4367     if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4368         return NVME_INVALID_FIELD | NVME_DNR;
4369     }
4370     if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4371         return NVME_INVALID_FIELD | NVME_DNR;
4372     }
4373 
4374     zrasf = (dw13 >> 8) & 0xff;
4375     if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4376         return NVME_INVALID_FIELD | NVME_DNR;
4377     }
4378 
4379     if (data_size < sizeof(NvmeZoneReportHeader)) {
4380         return NVME_INVALID_FIELD | NVME_DNR;
4381     }
4382 
4383     status = nvme_check_mdts(n, data_size);
4384     if (status) {
4385         return status;
4386     }
4387 
4388     partial = (dw13 >> 16) & 0x01;
4389 
4390     zone_entry_sz = sizeof(NvmeZoneDescr);
4391     if (zra == NVME_ZONE_REPORT_EXTENDED) {
4392         zone_entry_sz += ns->params.zd_extension_size;
4393     }
4394 
4395     max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4396     buf = g_malloc0(data_size);
4397 
4398     zone = &ns->zone_array[zone_idx];
4399     for (i = zone_idx; i < ns->num_zones; i++) {
4400         if (partial && nr_zones >= max_zones) {
4401             break;
4402         }
4403         if (nvme_zone_matches_filter(zrasf, zone++)) {
4404             nr_zones++;
4405         }
4406     }
4407     header = buf;
4408     header->nr_zones = cpu_to_le64(nr_zones);
4409 
4410     buf_p = buf + sizeof(NvmeZoneReportHeader);
4411     for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4412         zone = &ns->zone_array[zone_idx];
4413         if (nvme_zone_matches_filter(zrasf, zone)) {
4414             z = buf_p;
4415             buf_p += sizeof(NvmeZoneDescr);
4416 
4417             z->zt = zone->d.zt;
4418             z->zs = zone->d.zs;
4419             z->zcap = cpu_to_le64(zone->d.zcap);
4420             z->zslba = cpu_to_le64(zone->d.zslba);
4421             z->za = zone->d.za;
4422 
4423             if (nvme_wp_is_valid(zone)) {
4424                 z->wp = cpu_to_le64(zone->d.wp);
4425             } else {
4426                 z->wp = cpu_to_le64(~0ULL);
4427             }
4428 
4429             if (zra == NVME_ZONE_REPORT_EXTENDED) {
4430                 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4431                     memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4432                            ns->params.zd_extension_size);
4433                 }
4434                 buf_p += ns->params.zd_extension_size;
4435             }
4436 
4437             max_zones--;
4438         }
4439     }
4440 
4441     status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4442 
4443     g_free(buf);
4444 
4445     return status;
4446 }
4447 
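/*
 * I/O Management Receive, Reclaim Unit Handle Status: for every placement
 * handle and reclaim group pair, report the placement identifier and the
 * available media writes (RUAMW) of the referenced reclaim unit.
 */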
4448 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4449                                        size_t len)
4450 {
4451     NvmeNamespace *ns = req->ns;
4452     NvmeEnduranceGroup *endgrp;
4453     NvmeRuhStatus *hdr;
4454     NvmeRuhStatusDescr *ruhsd;
4455     unsigned int nruhsd;
4456     uint16_t rg, ph, *ruhid;
4457     size_t trans_len;
4458     g_autofree uint8_t *buf = NULL;
4459 
4460     if (!n->subsys) {
4461         return NVME_INVALID_FIELD | NVME_DNR;
4462     }
4463 
4464     if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4465         return NVME_INVALID_NSID | NVME_DNR;
4466     }
4467 
4468     if (!n->subsys->endgrp.fdp.enabled) {
4469         return NVME_FDP_DISABLED | NVME_DNR;
4470     }
4471 
4472     endgrp = ns->endgrp;
4473 
4474     nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4475     trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4476     buf = g_malloc(trans_len);
4477 
4478     trans_len = MIN(trans_len, len);
4479 
4480     hdr = (NvmeRuhStatus *)buf;
4481     ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4482 
4483     hdr->nruhsd = cpu_to_le16(nruhsd);
4484 
4485     ruhid = ns->fdp.phs;
4486 
4487     for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4488         NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4489 
4490         for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4491             uint16_t pid = nvme_make_pid(ns, rg, ph);
4492 
4493             ruhsd->pid = cpu_to_le16(pid);
4494             ruhsd->ruhid = *ruhid;
4495             ruhsd->earutr = 0;
4496             ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4497         }
4498     }
4499 
4500     return nvme_c2h(n, buf, trans_len, req);
4501 }
4502 
4503 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4504 {
4505     NvmeCmd *cmd = &req->cmd;
4506     uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4507     uint32_t numd = le32_to_cpu(cmd->cdw11);
4508     uint8_t mo = (cdw10 & 0xff);
4509     size_t len = (numd + 1) << 2;
4510 
4511     switch (mo) {
4512     case NVME_IOMR_MO_NOP:
4513         return 0;
4514     case NVME_IOMR_MO_RUH_STATUS:
4515         return nvme_io_mgmt_recv_ruhs(n, req, len);
4516     default:
4517         return NVME_INVALID_FIELD | NVME_DNR;
4518     }
4519 }
4520 
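/*
 * I/O Management Send, Reclaim Unit Handle Update: transfer the list of
 * placement identifiers from the host and update each referenced reclaim
 * unit via nvme_update_ruh().
 */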
4521 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4522 {
4523     NvmeCmd *cmd = &req->cmd;
4524     NvmeNamespace *ns = req->ns;
4525     uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4526     uint16_t ret = NVME_SUCCESS;
4527     uint32_t npid = (cdw10 >> 16) + 1;
4528     unsigned int i = 0;
4529     g_autofree uint16_t *pids = NULL;
4530     uint32_t maxnpid;
4531 
4532     if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4533         return NVME_FDP_DISABLED | NVME_DNR;
4534     }
4535 
4536     maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4537 
4538     if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4539         return NVME_INVALID_FIELD | NVME_DNR;
4540     }
4541 
4542     pids = g_new(uint16_t, npid);
4543 
4544     ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4545     if (ret) {
4546         return ret;
4547     }
4548 
4549     for (; i < npid; i++) {
4550         if (!nvme_update_ruh(n, ns, pids[i])) {
4551             return NVME_INVALID_FIELD | NVME_DNR;
4552         }
4553     }
4554 
4555     return ret;
4556 }
4557 
4558 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4559 {
4560     NvmeCmd *cmd = &req->cmd;
4561     uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4562     uint8_t mo = (cdw10 & 0xff);
4563 
4564     switch (mo) {
4565     case NVME_IOMS_MO_NOP:
4566         return 0;
4567     case NVME_IOMS_MO_RUH_UPDATE:
4568         return nvme_io_mgmt_send_ruh_update(n, req);
4569     default:
4570         return NVME_INVALID_FIELD | NVME_DNR;
4571     }
4572 }
4573 
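/*
 * Dispatch an I/O command to its handler after validating the namespace
 * identifier and checking that the opcode is supported by the command set
 * effects of the namespace.
 */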
4574 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4575 {
4576     NvmeNamespace *ns;
4577     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4578 
4579     trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4580                           req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4581 
4582     /*
4583      * In the base NVM command set, Flush may apply to all namespaces
4584      * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4585      * along with TP 4056 (Namespace Types), it may be pretty screwed up.
4586      *
4587      * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4588      * opcode with a specific command since we cannot determine a unique I/O
4589      * command set. Opcode 0h could have any other meaning than something
4590      * command set. Opcode 0h could have a meaning other than something
4591      * equivalent to flushing - say it DOES have completely different
4592      * semantics in some other command set. Does an NSID of FFFFFFFFh then
4593      * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4594      * whatever command that uses the 0h opcode if, and only if, it allows NSID
4595      * to be FFFFFFFFh"?
4596      *
4597      * Anyway (and luckily), for now, we do not care about this since the
4598      * device only supports namespace types that include the NVM Flush command
4599      * (NVM and Zoned), so always do an NVM Flush.
4600      */
4601 
4602     if (req->cmd.opcode == NVME_CMD_FLUSH) {
4603         return nvme_flush(n, req);
4604     }
4605 
4606     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4607         return NVME_INVALID_NSID | NVME_DNR;
4608     }
4609 
4610     ns = nvme_ns(n, nsid);
4611     if (unlikely(!ns)) {
4612         return NVME_INVALID_FIELD | NVME_DNR;
4613     }
4614 
4615     if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4616         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4617         return NVME_INVALID_OPCODE | NVME_DNR;
4618     }
4619 
4620     if (ns->status) {
4621         return ns->status;
4622     }
4623 
4624     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4625         return NVME_INVALID_FIELD;
4626     }
4627 
4628     req->ns = ns;
4629 
4630     switch (req->cmd.opcode) {
4631     case NVME_CMD_WRITE_ZEROES:
4632         return nvme_write_zeroes(n, req);
4633     case NVME_CMD_ZONE_APPEND:
4634         return nvme_zone_append(n, req);
4635     case NVME_CMD_WRITE:
4636         return nvme_write(n, req);
4637     case NVME_CMD_READ:
4638         return nvme_read(n, req);
4639     case NVME_CMD_COMPARE:
4640         return nvme_compare(n, req);
4641     case NVME_CMD_DSM:
4642         return nvme_dsm(n, req);
4643     case NVME_CMD_VERIFY:
4644         return nvme_verify(n, req);
4645     case NVME_CMD_COPY:
4646         return nvme_copy(n, req);
4647     case NVME_CMD_ZONE_MGMT_SEND:
4648         return nvme_zone_mgmt_send(n, req);
4649     case NVME_CMD_ZONE_MGMT_RECV:
4650         return nvme_zone_mgmt_recv(n, req);
4651     case NVME_CMD_IO_MGMT_RECV:
4652         return nvme_io_mgmt_recv(n, req);
4653     case NVME_CMD_IO_MGMT_SEND:
4654         return nvme_io_mgmt_send(n, req);
4655     default:
4656         assert(false);
4657     }
4658 
4659     return NVME_INVALID_OPCODE | NVME_DNR;
4660 }
4661 
4662 static void nvme_cq_notifier(EventNotifier *e)
4663 {
4664     NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4665     NvmeCtrl *n = cq->ctrl;
4666 
4667     if (!event_notifier_test_and_clear(e)) {
4668         return;
4669     }
4670 
4671     nvme_update_cq_head(cq);
4672 
4673     if (cq->tail == cq->head) {
4674         if (cq->irq_enabled) {
4675             n->cq_pending--;
4676         }
4677 
4678         nvme_irq_deassert(n, cq);
4679     }
4680 
4681     qemu_bh_schedule(cq->bh);
4682 }
4683 
4684 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4685 {
4686     NvmeCtrl *n = cq->ctrl;
4687     uint16_t offset = (cq->cqid << 3) + (1 << 2);
4688     int ret;
4689 
4690     ret = event_notifier_init(&cq->notifier, 0);
4691     if (ret < 0) {
4692         return ret;
4693     }
4694 
4695     event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4696     memory_region_add_eventfd(&n->iomem,
4697                               0x1000 + offset, 4, false, 0, &cq->notifier);
4698 
4699     return 0;
4700 }
4701 
4702 static void nvme_sq_notifier(EventNotifier *e)
4703 {
4704     NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4705 
4706     if (!event_notifier_test_and_clear(e)) {
4707         return;
4708     }
4709 
4710     nvme_process_sq(sq);
4711 }
4712 
4713 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4714 {
4715     NvmeCtrl *n = sq->ctrl;
4716     uint16_t offset = sq->sqid << 3;
4717     int ret;
4718 
4719     ret = event_notifier_init(&sq->notifier, 0);
4720     if (ret < 0) {
4721         return ret;
4722     }
4723 
4724     event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4725     memory_region_add_eventfd(&n->iomem,
4726                               0x1000 + offset, 4, false, 0, &sq->notifier);
4727 
4728     return 0;
4729 }
4730 
4731 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4732 {
4733     uint16_t offset = sq->sqid << 3;
4734 
4735     n->sq[sq->sqid] = NULL;
4736     qemu_bh_delete(sq->bh);
4737     if (sq->ioeventfd_enabled) {
4738         memory_region_del_eventfd(&n->iomem,
4739                                   0x1000 + offset, 4, false, 0, &sq->notifier);
4740         event_notifier_set_handler(&sq->notifier, NULL);
4741         event_notifier_cleanup(&sq->notifier);
4742     }
4743     g_free(sq->io_req);
4744     if (sq->sqid) {
4745         g_free(sq);
4746     }
4747 }
4748 
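/*
 * Delete I/O Submission Queue: cancel outstanding requests, post any pending
 * completions, return the queue's requests to its free list and free the
 * queue.
 */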
4749 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4750 {
4751     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4752     NvmeRequest *r, *next;
4753     NvmeSQueue *sq;
4754     NvmeCQueue *cq;
4755     uint16_t qid = le16_to_cpu(c->qid);
4756 
4757     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4758         trace_pci_nvme_err_invalid_del_sq(qid);
4759         return NVME_INVALID_QID | NVME_DNR;
4760     }
4761 
4762     trace_pci_nvme_del_sq(qid);
4763 
4764     sq = n->sq[qid];
4765     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4766         r = QTAILQ_FIRST(&sq->out_req_list);
4767         assert(r->aiocb);
4768         blk_aio_cancel(r->aiocb);
4769     }
4770 
4771     assert(QTAILQ_EMPTY(&sq->out_req_list));
4772 
4773     if (!nvme_check_cqid(n, sq->cqid)) {
4774         cq = n->cq[sq->cqid];
4775         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4776 
4777         nvme_post_cqes(cq);
4778         QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4779             if (r->sq == sq) {
4780                 QTAILQ_REMOVE(&cq->req_list, r, entry);
4781                 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4782             }
4783         }
4784     }
4785 
4786     nvme_free_sq(sq, n);
4787     return NVME_SUCCESS;
4788 }
4789 
4790 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4791                          uint16_t sqid, uint16_t cqid, uint16_t size)
4792 {
4793     int i;
4794     NvmeCQueue *cq;
4795 
4796     sq->ctrl = n;
4797     sq->dma_addr = dma_addr;
4798     sq->sqid = sqid;
4799     sq->size = size;
4800     sq->cqid = cqid;
4801     sq->head = sq->tail = 0;
4802     sq->io_req = g_new0(NvmeRequest, sq->size);
4803 
4804     QTAILQ_INIT(&sq->req_list);
4805     QTAILQ_INIT(&sq->out_req_list);
4806     for (i = 0; i < sq->size; i++) {
4807         sq->io_req[i].sq = sq;
4808         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4809     }
4810 
4811     sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4812                                  &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4813 
4814     if (n->dbbuf_enabled) {
4815         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4816         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4817 
4818         if (n->params.ioeventfd && sq->sqid != 0) {
4819             if (!nvme_init_sq_ioeventfd(sq)) {
4820                 sq->ioeventfd_enabled = true;
4821             }
4822         }
4823     }
4824 
4825     assert(n->cq[cqid]);
4826     cq = n->cq[cqid];
4827     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4828     n->sq[sqid] = sq;
4829 }
4830 
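/*
 * Create I/O Submission Queue: validate the completion queue identifier, the
 * queue identifier, the queue size against CAP.MQES and the queue address
 * (physically contiguous queues only) before initializing the queue.
 */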
4831 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4832 {
4833     NvmeSQueue *sq;
4834     NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4835 
4836     uint16_t cqid = le16_to_cpu(c->cqid);
4837     uint16_t sqid = le16_to_cpu(c->sqid);
4838     uint16_t qsize = le16_to_cpu(c->qsize);
4839     uint16_t qflags = le16_to_cpu(c->sq_flags);
4840     uint64_t prp1 = le64_to_cpu(c->prp1);
4841 
4842     trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4843 
4844     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4845         trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4846         return NVME_INVALID_CQID | NVME_DNR;
4847     }
4848     if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4849         trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4850         return NVME_INVALID_QID | NVME_DNR;
4851     }
4852     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4853         trace_pci_nvme_err_invalid_create_sq_size(qsize);
4854         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4855     }
4856     if (unlikely(prp1 & (n->page_size - 1))) {
4857         trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4858         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4859     }
4860     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4861         trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4862         return NVME_INVALID_FIELD | NVME_DNR;
4863     }
4864     sq = g_malloc0(sizeof(*sq));
4865     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4866     return NVME_SUCCESS;
4867 }
4868 
4869 struct nvme_stats {
4870     uint64_t units_read;
4871     uint64_t units_written;
4872     uint64_t read_commands;
4873     uint64_t write_commands;
4874 };
4875 
4876 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4877 {
4878     BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4879 
4880     stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4881     stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4882     stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4883     stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4884 }
4885 
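/*
 * SMART / Health Information log page. Data units are reported in units of
 * 1000 512-byte blocks, derived from block-layer accounting for either a
 * single namespace or all namespaces (broadcast NSID).
 */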
4886 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4887                                 uint64_t off, NvmeRequest *req)
4888 {
4889     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4890     struct nvme_stats stats = { 0 };
4891     NvmeSmartLog smart = { 0 };
4892     uint32_t trans_len;
4893     NvmeNamespace *ns;
4894     time_t current_ms;
4895     uint64_t u_read, u_written;
4896 
4897     if (off >= sizeof(smart)) {
4898         return NVME_INVALID_FIELD | NVME_DNR;
4899     }
4900 
4901     if (nsid != 0xffffffff) {
4902         ns = nvme_ns(n, nsid);
4903         if (!ns) {
4904             return NVME_INVALID_NSID | NVME_DNR;
4905         }
4906         nvme_set_blk_stats(ns, &stats);
4907     } else {
4908         int i;
4909 
4910         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4911             ns = nvme_ns(n, i);
4912             if (!ns) {
4913                 continue;
4914             }
4915             nvme_set_blk_stats(ns, &stats);
4916         }
4917     }
4918 
4919     trans_len = MIN(sizeof(smart) - off, buf_len);
4920     smart.critical_warning = n->smart_critical_warning;
4921 
4922     u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
4923     u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
4924 
4925     smart.data_units_read[0] = cpu_to_le64(u_read);
4926     smart.data_units_written[0] = cpu_to_le64(u_written);
4927     smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4928     smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4929 
4930     smart.temperature = cpu_to_le16(n->temperature);
4931 
4932     if ((n->temperature >= n->features.temp_thresh_hi) ||
4933         (n->temperature <= n->features.temp_thresh_low)) {
4934         smart.critical_warning |= NVME_SMART_TEMPERATURE;
4935     }
4936 
4937     current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4938     smart.power_on_hours[0] =
4939         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4940 
4941     if (!rae) {
4942         nvme_clear_events(n, NVME_AER_TYPE_SMART);
4943     }
4944 
4945     return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4946 }
4947 
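/*
 * Endurance Group Information log page (endurance group 1 only), aggregating
 * block-layer statistics over all namespaces in the subsystem.
 */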
4948 static uint16_t nvme_endgrp_info(NvmeCtrl *n,  uint8_t rae, uint32_t buf_len,
4949                                  uint64_t off, NvmeRequest *req)
4950 {
4951     uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
4952     uint16_t endgrpid = (dw11 >> 16) & 0xffff;
4953     struct nvme_stats stats = {};
4954     NvmeEndGrpLog info = {};
4955     int i;
4956 
4957     if (!n->subsys || endgrpid != 0x1) {
4958         return NVME_INVALID_FIELD | NVME_DNR;
4959     }
4960 
4961     if (off >= sizeof(info)) {
4962         return NVME_INVALID_FIELD | NVME_DNR;
4963     }
4964 
4965     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4966         NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
4967         if (!ns) {
4968             continue;
4969         }
4970 
4971         nvme_set_blk_stats(ns, &stats);
4972     }
4973 
4974     info.data_units_read[0] =
4975         cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
4976     info.data_units_written[0] =
4977         cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4978     info.media_units_written[0] =
4979         cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4980 
4981     info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4982     info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4983 
4984     buf_len = MIN(sizeof(info) - off, buf_len);
4985 
4986     return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
4987 }
4988 
4990 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4991                                  NvmeRequest *req)
4992 {
4993     uint32_t trans_len;
4994     NvmeFwSlotInfoLog fw_log = {
4995         .afi = 0x1,
4996     };
4997 
4998     if (off >= sizeof(fw_log)) {
4999         return NVME_INVALID_FIELD | NVME_DNR;
5000     }
5001 
5002     strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5003     trans_len = MIN(sizeof(fw_log) - off, buf_len);
5004 
5005     return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5006 }
5007 
5008 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5009                                 uint64_t off, NvmeRequest *req)
5010 {
5011     uint32_t trans_len;
5012     NvmeErrorLog errlog;
5013 
5014     if (off >= sizeof(errlog)) {
5015         return NVME_INVALID_FIELD | NVME_DNR;
5016     }
5017 
5018     if (!rae) {
5019         nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5020     }
5021 
5022     memset(&errlog, 0x0, sizeof(errlog));
5023     trans_len = MIN(sizeof(errlog) - off, buf_len);
5024 
5025     return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5026 }
5027 
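/*
 * Changed Namespace List log page: drain the controller's changed-namespace
 * bitmap into the 1024-entry list; on overflow the list collapses to a single
 * FFFFFFFFh entry as required by the spec.
 */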
5028 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5029                                     uint64_t off, NvmeRequest *req)
5030 {
5031     uint32_t nslist[1024];
5032     uint32_t trans_len;
5033     int i = 0;
5034     uint32_t nsid;
5035 
5036     if (off >= sizeof(nslist)) {
5037         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5038         return NVME_INVALID_FIELD | NVME_DNR;
5039     }
5040 
5041     memset(nslist, 0x0, sizeof(nslist));
5042     trans_len = MIN(sizeof(nslist) - off, buf_len);
5043 
5044     while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5045             NVME_CHANGED_NSID_SIZE) {
5046         /*
5047          * If there are more than 1024 namespaces, the first entry in the log
5048          * page should be set to FFFFFFFFh and the others to 0, per the spec.
5049          */
5050         if (i == ARRAY_SIZE(nslist)) {
5051             memset(nslist, 0x0, sizeof(nslist));
5052             nslist[0] = 0xffffffff;
5053             break;
5054         }
5055 
5056         nslist[i++] = nsid;
5057         clear_bit(nsid, n->changed_nsids);
5058     }
5059 
5060     /*
5061      * Clear all remaining changed-namespace bits in case the loop above was
5062      * cut short because there were more than 1024 namespaces.
5063      */
5064     if (nslist[0] == 0xffffffff) {
5065         bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5066     }
5067 
5068     if (!rae) {
5069         nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5070     }
5071 
5072     return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5073 }
5074 
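/*
 * Commands Supported and Effects log page: report the admin command effects
 * and, depending on CC.CSS and the requested CSI, the NVM or Zoned I/O
 * command effects.
 */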
5075 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5076                                  uint64_t off, NvmeRequest *req)
5077 {
5078     NvmeEffectsLog log = {};
5079     const uint32_t *src_iocs = NULL;
5080     uint32_t trans_len;
5081 
5082     if (off >= sizeof(log)) {
5083         trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5084         return NVME_INVALID_FIELD | NVME_DNR;
5085     }
5086 
5087     switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5088     case NVME_CC_CSS_NVM:
5089         src_iocs = nvme_cse_iocs_nvm;
5090         /* fall through */
5091     case NVME_CC_CSS_ADMIN_ONLY:
5092         break;
5093     case NVME_CC_CSS_CSI:
5094         switch (csi) {
5095         case NVME_CSI_NVM:
5096             src_iocs = nvme_cse_iocs_nvm;
5097             break;
5098         case NVME_CSI_ZONED:
5099             src_iocs = nvme_cse_iocs_zoned;
5100             break;
5101         }
5102     }
5103 
5104     memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
5105 
5106     if (src_iocs) {
5107         memcpy(log.iocs, src_iocs, sizeof(log.iocs));
5108     }
5109 
5110     trans_len = MIN(sizeof(log) - off, buf_len);
5111 
5112     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5113 }
5114 
5115 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5116 {
5117     size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5118                        + vss;
5119     return ROUND_UP(entry_siz, 8);
5120 }
5121 
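/*
 * FDP Configurations log page for endurance group 1. If FDP is disabled, a
 * single default configuration descriptor is reported instead.
 */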
5122 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5123                                uint64_t off, NvmeRequest *req)
5124 {
5125     uint32_t log_size, trans_len;
5126     g_autofree uint8_t *buf = NULL;
5127     NvmeFdpDescrHdr *hdr;
5128     NvmeRuhDescr *ruhd;
5129     NvmeEnduranceGroup *endgrp;
5130     NvmeFdpConfsHdr *log;
5131     size_t nruh, fdp_descr_size;
5132     int i;
5133 
5134     if (endgrpid != 1 || !n->subsys) {
5135         return NVME_INVALID_FIELD | NVME_DNR;
5136     }
5137 
5138     endgrp = &n->subsys->endgrp;
5139 
5140     if (endgrp->fdp.enabled) {
5141         nruh = endgrp->fdp.nruh;
5142     } else {
5143         nruh = 1;
5144     }
5145 
5146     fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5147     log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5148 
5149     if (off >= log_size) {
5150         return NVME_INVALID_FIELD | NVME_DNR;
5151     }
5152 
5153     trans_len = MIN(log_size - off, buf_len);
5154 
5155     buf = g_malloc0(log_size);
5156     log = (NvmeFdpConfsHdr *)buf;
5157     hdr = (NvmeFdpDescrHdr *)(log + 1);
5158     ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5159 
5160     log->num_confs = cpu_to_le16(0);
5161     log->size = cpu_to_le32(log_size);
5162 
5163     hdr->descr_size = cpu_to_le16(fdp_descr_size);
5164     if (endgrp->fdp.enabled) {
5165         hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5166         hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5167         hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5168         hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5169         hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5170         hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5171         hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5172 
5173         for (i = 0; i < nruh; i++) {
5174             ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5175             ruhd++;
5176         }
5177     } else {
5178         /* 1 bit for RUH in PIF -> 2 RUHs max. */
5179         hdr->nrg = cpu_to_le16(1);
5180         hdr->nruh = cpu_to_le16(1);
5181         hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5182         hdr->nnss = cpu_to_le32(1);
5183         hdr->runs = cpu_to_le64(96 * MiB);
5184 
5185         ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5186     }
5187 
5188     return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5189 }
5190 
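/*
 * Reclaim Unit Handle Usage log page: report the reclaim unit handle
 * attributes for every RUH in Endurance Group 1; rejected if FDP is
 * disabled.
 */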
5191 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5192                                    uint32_t dw10, uint32_t dw12,
5193                                    uint32_t buf_len, uint64_t off,
5194                                    NvmeRequest *req)
5195 {
5196     NvmeRuHandle *ruh;
5197     NvmeRuhuLog *hdr;
5198     NvmeRuhuDescr *ruhud;
5199     NvmeEnduranceGroup *endgrp;
5200     g_autofree uint8_t *buf = NULL;
5201     uint32_t log_size, trans_len;
5202     uint16_t i;
5203 
5204     if (endgrpid != 1 || !n->subsys) {
5205         return NVME_INVALID_FIELD | NVME_DNR;
5206     }
5207 
5208     endgrp = &n->subsys->endgrp;
5209 
5210     if (!endgrp->fdp.enabled) {
5211         return NVME_FDP_DISABLED | NVME_DNR;
5212     }
5213 
5214     log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5215 
5216     if (off >= log_size) {
5217         return NVME_INVALID_FIELD | NVME_DNR;
5218     }
5219 
5220     trans_len = MIN(log_size - off, buf_len);
5221 
5222     buf = g_malloc0(log_size);
5223     hdr = (NvmeRuhuLog *)buf;
5224     ruhud = (NvmeRuhuDescr *)(hdr + 1);
5225 
5226     ruh = endgrp->fdp.ruhs;
5227     hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5228 
5229     for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5230         ruhud->ruha = ruh->ruha;
5231     }
5232 
5233     return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5234 }
5235 
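/*
 * FDP Statistics log page: report host bytes with metadata written, media
 * bytes with metadata written and media bytes erased for Endurance Group 1.
 */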
5236 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5237                                uint64_t off, NvmeRequest *req)
5238 {
5239     NvmeEnduranceGroup *endgrp;
5240     NvmeFdpStatsLog log = {};
5241     uint32_t trans_len;
5242 
5243     if (off >= sizeof(NvmeFdpStatsLog)) {
5244         return NVME_INVALID_FIELD | NVME_DNR;
5245     }
5246 
5247     if (endgrpid != 1 || !n->subsys) {
5248         return NVME_INVALID_FIELD | NVME_DNR;
5249     }
5250 
5251     if (!n->subsys->endgrp.fdp.enabled) {
5252         return NVME_FDP_DISABLED | NVME_DNR;
5253     }
5254 
5255     endgrp = &n->subsys->endgrp;
5256 
5257     trans_len = MIN(sizeof(log) - off, buf_len);
5258 
5259     /* the spec defines these as 128-bit values; we only use the lower 64 bits */
5260     log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5261     log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5262     log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5263 
5264     return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5265 }
5266 
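/*
 * FDP Events log page: copy either the host or the controller event buffer
 * (selected by bit 8 of command dword 10), linearizing the ring buffer when
 * it has wrapped around.
 */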
5267 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5268                                 uint32_t buf_len, uint64_t off,
5269                                 NvmeRequest *req)
5270 {
5271     NvmeEnduranceGroup *endgrp;
5272     NvmeCmd *cmd = &req->cmd;
5273     bool host_events = (cmd->cdw10 >> 8) & 0x1;
5274     uint32_t log_size, trans_len;
5275     NvmeFdpEventBuffer *ebuf;
5276     g_autofree NvmeFdpEventsLog *elog = NULL;
5277     NvmeFdpEvent *event;
5278 
5279     if (endgrpid != 1 || !n->subsys) {
5280         return NVME_INVALID_FIELD | NVME_DNR;
5281     }
5282 
5283     endgrp = &n->subsys->endgrp;
5284 
5285     if (!endgrp->fdp.enabled) {
5286         return NVME_FDP_DISABLED | NVME_DNR;
5287     }
5288 
5289     if (host_events) {
5290         ebuf = &endgrp->fdp.host_events;
5291     } else {
5292         ebuf = &endgrp->fdp.ctrl_events;
5293     }
5294 
5295     log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5296 
5297     if (off >= log_size) {
5298         return NVME_INVALID_FIELD | NVME_DNR;
5299     }
5300 
5301     trans_len = MIN(log_size - off, buf_len);
5302     elog = g_malloc0(log_size);
5303     elog->num_events = cpu_to_le32(ebuf->nelems);
5304     event = (NvmeFdpEvent *)(elog + 1);
5305 
5306     if (ebuf->nelems && ebuf->start == ebuf->next) {
5307         unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5308         /* wrap around: copy [start, NVME_FDP_MAX_EVENTS) and then [0, next) */
5309         memcpy(event, &ebuf->events[ebuf->start],
5310                sizeof(NvmeFdpEvent) * nelems);
5311         memcpy(event + nelems, ebuf->events,
5312                sizeof(NvmeFdpEvent) * ebuf->next);
5313     } else if (ebuf->start < ebuf->next) {
5314         memcpy(event, &ebuf->events[ebuf->start],
5315                sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5316     }
5317 
5318     return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5319 }
5320 
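/*
 * Get Log Page admin command: decode the number of dwords and the log page
 * offset from dwords 10-13, check dword alignment and MDTS, then dispatch
 * on the Log Page Identifier.
 */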
5321 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5322 {
5323     NvmeCmd *cmd = &req->cmd;
5324 
5325     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5326     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5327     uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5328     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5329     uint8_t  lid = dw10 & 0xff;
5330     uint8_t  lsp = (dw10 >> 8) & 0xf;
5331     uint8_t  rae = (dw10 >> 15) & 0x1;
5332     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
5333     uint32_t numdl, numdu, lspi;
5334     uint64_t off, lpol, lpou;
5335     size_t   len;
5336     uint16_t status;
5337 
5338     numdl = (dw10 >> 16);
5339     numdu = (dw11 & 0xffff);
5340     lspi = (dw11 >> 16);
5341     lpol = dw12;
5342     lpou = dw13;
5343 
5344     len = (((numdu << 16) | numdl) + 1) << 2;
5345     off = (lpou << 32ULL) | lpol;
5346 
5347     if (off & 0x3) {
5348         return NVME_INVALID_FIELD | NVME_DNR;
5349     }
5350 
5351     trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5352 
5353     status = nvme_check_mdts(n, len);
5354     if (status) {
5355         return status;
5356     }
5357 
5358     switch (lid) {
5359     case NVME_LOG_ERROR_INFO:
5360         return nvme_error_info(n, rae, len, off, req);
5361     case NVME_LOG_SMART_INFO:
5362         return nvme_smart_info(n, rae, len, off, req);
5363     case NVME_LOG_FW_SLOT_INFO:
5364         return nvme_fw_log_info(n, len, off, req);
5365     case NVME_LOG_CHANGED_NSLIST:
5366         return nvme_changed_nslist(n, rae, len, off, req);
5367     case NVME_LOG_CMD_EFFECTS:
5368         return nvme_cmd_effects(n, csi, len, off, req);
5369     case NVME_LOG_ENDGRP:
5370         return nvme_endgrp_info(n, rae, len, off, req);
5371     case NVME_LOG_FDP_CONFS:
5372         return nvme_fdp_confs(n, lspi, len, off, req);
5373     case NVME_LOG_FDP_RUH_USAGE:
5374         return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5375     case NVME_LOG_FDP_STATS:
5376         return nvme_fdp_stats(n, lspi, len, off, req);
5377     case NVME_LOG_FDP_EVENTS:
5378         return nvme_fdp_events(n, lspi, len, off, req);
5379     default:
5380         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5381         return NVME_INVALID_FIELD | NVME_DNR;
5382     }
5383 }
5384 
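/*
 * Tear down a completion queue: clear the controller reference, delete the
 * bottom half, remove any ioeventfd and release the MSI-X vector. The admin
 * CQ (cqid 0) is embedded in the controller state and is not freed here.
 */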
5385 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5386 {
5387     PCIDevice *pci = PCI_DEVICE(n);
5388     uint16_t offset = (cq->cqid << 3) + (1 << 2);
5389 
5390     n->cq[cq->cqid] = NULL;
5391     qemu_bh_delete(cq->bh);
5392     if (cq->ioeventfd_enabled) {
5393         memory_region_del_eventfd(&n->iomem,
5394                                   0x1000 + offset, 4, false, 0, &cq->notifier);
5395         event_notifier_set_handler(&cq->notifier, NULL);
5396         event_notifier_cleanup(&cq->notifier);
5397     }
5398     if (msix_enabled(pci)) {
5399         msix_vector_unuse(pci, cq->vector);
5400     }
5401     if (cq->cqid) {
5402         g_free(cq);
5403     }
5404 }
5405 
5406 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5407 {
5408     NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5409     NvmeCQueue *cq;
5410     uint16_t qid = le16_to_cpu(c->qid);
5411 
5412     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5413         trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5414         return NVME_INVALID_CQID | NVME_DNR;
5415     }
5416 
5417     cq = n->cq[qid];
5418     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5419         trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5420         return NVME_INVALID_QUEUE_DEL;
5421     }
5422 
5423     if (cq->irq_enabled && cq->tail != cq->head) {
5424         n->cq_pending--;
5425     }
5426 
5427     nvme_irq_deassert(n, cq);
5428     trace_pci_nvme_del_cq(qid);
5429     nvme_free_cq(cq, n);
5430     return NVME_SUCCESS;
5431 }
5432 
5433 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5434                          uint16_t cqid, uint16_t vector, uint16_t size,
5435                          uint16_t irq_enabled)
5436 {
5437     PCIDevice *pci = PCI_DEVICE(n);
5438 
5439     if (msix_enabled(pci)) {
5440         msix_vector_use(pci, vector);
5441     }
5442     cq->ctrl = n;
5443     cq->cqid = cqid;
5444     cq->size = size;
5445     cq->dma_addr = dma_addr;
5446     cq->phase = 1;
5447     cq->irq_enabled = irq_enabled;
5448     cq->vector = vector;
5449     cq->head = cq->tail = 0;
5450     QTAILQ_INIT(&cq->req_list);
5451     QTAILQ_INIT(&cq->sq_list);
5452     if (n->dbbuf_enabled) {
5453         cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5454         cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5455 
5456         if (n->params.ioeventfd && cqid != 0) {
5457             if (!nvme_init_cq_ioeventfd(cq)) {
5458                 cq->ioeventfd_enabled = true;
5459             }
5460         }
5461     }
5462     n->cq[cqid] = cq;
5463     cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5464                                  &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5465 }
5466 
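/*
 * Create I/O Completion Queue admin command: validate the CC entry sizes,
 * queue identifier, queue size, PRP alignment and interrupt vector before
 * initializing the queue.
 */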
5467 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5468 {
5469     NvmeCQueue *cq;
5470     NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5471     uint16_t cqid = le16_to_cpu(c->cqid);
5472     uint16_t vector = le16_to_cpu(c->irq_vector);
5473     uint16_t qsize = le16_to_cpu(c->qsize);
5474     uint16_t qflags = le16_to_cpu(c->cq_flags);
5475     uint64_t prp1 = le64_to_cpu(c->prp1);
5476     uint32_t cc = ldl_le_p(&n->bar.cc);
5477     uint8_t iocqes = NVME_CC_IOCQES(cc);
5478     uint8_t iosqes = NVME_CC_IOSQES(cc);
5479 
5480     trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5481                              NVME_CQ_FLAGS_IEN(qflags) != 0);
5482 
5483     if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5484         trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5485         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5486     }
5487 
5488     if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5489         trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5490         return NVME_INVALID_QID | NVME_DNR;
5491     }
5492     if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5493         trace_pci_nvme_err_invalid_create_cq_size(qsize);
5494         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5495     }
5496     if (unlikely(prp1 & (n->page_size - 1))) {
5497         trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5498         return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5499     }
5500     if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5501         trace_pci_nvme_err_invalid_create_cq_vector(vector);
5502         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5503     }
5504     if (unlikely(vector >= n->conf_msix_qsize)) {
5505         trace_pci_nvme_err_invalid_create_cq_vector(vector);
5506         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5507     }
5508     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5509         trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5510         return NVME_INVALID_FIELD | NVME_DNR;
5511     }
5512 
5513     cq = g_malloc0(sizeof(*cq));
5514     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5515                  NVME_CQ_FLAGS_IEN(qflags));
5516 
5517     /*
5518      * It is only required to set qs_created when creating a completion queue;
5519      * creating a submission queue without a matching completion queue will
5520      * fail.
5521      */
5522     n->qs_created = true;
5523     return NVME_SUCCESS;
5524 }
5525 
5526 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5527 {
5528     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5529 
5530     return nvme_c2h(n, id, sizeof(id), req);
5531 }
5532 
5533 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5534 {
5535     trace_pci_nvme_identify_ctrl();
5536 
5537     return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5538 }
5539 
5540 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5541 {
5542     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5543     uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5544     NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5545 
5546     trace_pci_nvme_identify_ctrl_csi(c->csi);
5547 
5548     switch (c->csi) {
5549     case NVME_CSI_NVM:
5550         id_nvm->vsl = n->params.vsl;
5551         id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5552         break;
5553 
5554     case NVME_CSI_ZONED:
5555         ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5556         break;
5557 
5558     default:
5559         return NVME_INVALID_FIELD | NVME_DNR;
5560     }
5561 
5562     return nvme_c2h(n, id, sizeof(id), req);
5563 }
5564 
5565 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5566 {
5567     NvmeNamespace *ns;
5568     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5569     uint32_t nsid = le32_to_cpu(c->nsid);
5570 
5571     trace_pci_nvme_identify_ns(nsid);
5572 
5573     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5574         return NVME_INVALID_NSID | NVME_DNR;
5575     }
5576 
5577     ns = nvme_ns(n, nsid);
5578     if (unlikely(!ns)) {
5579         if (!active) {
5580             ns = nvme_subsys_ns(n->subsys, nsid);
5581             if (!ns) {
5582                 return nvme_rpt_empty_id_struct(n, req);
5583             }
5584         } else {
5585             return nvme_rpt_empty_id_struct(n, req);
5586         }
5587     }
5588 
5589     if (active || ns->csi == NVME_CSI_NVM) {
5590         return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5591     }
5592 
5593     return NVME_INVALID_CMD_SET | NVME_DNR;
5594 }
5595 
5596 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5597                                         bool attached)
5598 {
5599     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5600     uint32_t nsid = le32_to_cpu(c->nsid);
5601     uint16_t min_id = le16_to_cpu(c->ctrlid);
5602     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5603     uint16_t *ids = &list[1];
5604     NvmeNamespace *ns;
5605     NvmeCtrl *ctrl;
5606     int cntlid, nr_ids = 0;
5607 
5608     trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5609 
5610     if (!n->subsys) {
5611         return NVME_INVALID_FIELD | NVME_DNR;
5612     }
5613 
5614     if (attached) {
5615         if (nsid == NVME_NSID_BROADCAST) {
5616             return NVME_INVALID_FIELD | NVME_DNR;
5617         }
5618 
5619         ns = nvme_subsys_ns(n->subsys, nsid);
5620         if (!ns) {
5621             return NVME_INVALID_FIELD | NVME_DNR;
5622         }
5623     }
5624 
5625     for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5626         ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5627         if (!ctrl) {
5628             continue;
5629         }
5630 
5631         if (attached && !nvme_ns(ctrl, nsid)) {
5632             continue;
5633         }
5634 
5635         ids[nr_ids++] = cntlid;
5636     }
5637 
5638     list[0] = nr_ids;
5639 
5640     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5641 }
5642 
5643 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5644 {
5645     trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5646 
5647     return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5648                     sizeof(NvmePriCtrlCap), req);
5649 }
5650 
5651 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5652 {
5653     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5654     uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5655     uint16_t min_id = le16_to_cpu(c->ctrlid);
5656     uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5657     NvmeSecCtrlList list = {0};
5658     uint8_t i;
5659 
5660     for (i = 0; i < num_sec_ctrl; i++) {
5661         if (n->sec_ctrl_list[i].scid >= min_id) {
5662             list.numcntl = MIN(num_sec_ctrl - i, 127);
5663             memcpy(&list.sec, n->sec_ctrl_list + i,
5664                    list.numcntl * sizeof(NvmeSecCtrlEntry));
5665             break;
5666         }
5667     }
5668 
5669     trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5670 
5671     return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5672 }
5673 
5674 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5675                                      bool active)
5676 {
5677     NvmeNamespace *ns;
5678     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5679     uint32_t nsid = le32_to_cpu(c->nsid);
5680 
5681     trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5682 
5683     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5684         return NVME_INVALID_NSID | NVME_DNR;
5685     }
5686 
5687     ns = nvme_ns(n, nsid);
5688     if (unlikely(!ns)) {
5689         if (!active) {
5690             ns = nvme_subsys_ns(n->subsys, nsid);
5691             if (!ns) {
5692                 return nvme_rpt_empty_id_struct(n, req);
5693             }
5694         } else {
5695             return nvme_rpt_empty_id_struct(n, req);
5696         }
5697     }
5698 
5699     if (c->csi == NVME_CSI_NVM) {
5700         return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5701                         req);
5702     } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5703         return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5704                         req);
5705     }
5706 
5707     return NVME_INVALID_FIELD | NVME_DNR;
5708 }
5709 
5710 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5711                                      bool active)
5712 {
5713     NvmeNamespace *ns;
5714     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5715     uint32_t min_nsid = le32_to_cpu(c->nsid);
5716     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5717     static const int data_len = sizeof(list);
5718     uint32_t *list_ptr = (uint32_t *)list;
5719     int i, j = 0;
5720 
5721     trace_pci_nvme_identify_nslist(min_nsid);
5722 
5723     /*
5724      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5725      * since the Active Namespace ID List should return namespaces with ids
5726      * *higher* than the NSID specified in the command. This is also specified
5727      * in the spec (NVM Express v1.3d, Section 5.15.4).
5728      */
5729     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5730         return NVME_INVALID_NSID | NVME_DNR;
5731     }
5732 
5733     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5734         ns = nvme_ns(n, i);
5735         if (!ns) {
5736             if (!active) {
5737                 ns = nvme_subsys_ns(n->subsys, i);
5738                 if (!ns) {
5739                     continue;
5740                 }
5741             } else {
5742                 continue;
5743             }
5744         }
5745         if (ns->params.nsid <= min_nsid) {
5746             continue;
5747         }
5748         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5749         if (j == data_len / sizeof(uint32_t)) {
5750             break;
5751         }
5752     }
5753 
5754     return nvme_c2h(n, list, data_len, req);
5755 }
5756 
5757 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5758                                          bool active)
5759 {
5760     NvmeNamespace *ns;
5761     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5762     uint32_t min_nsid = le32_to_cpu(c->nsid);
5763     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5764     static const int data_len = sizeof(list);
5765     uint32_t *list_ptr = (uint32_t *)list;
5766     int i, j = 0;
5767 
5768     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5769 
5770     /*
5771      * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5772      */
5773     if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5774         return NVME_INVALID_NSID | NVME_DNR;
5775     }
5776 
5777     if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5778         return NVME_INVALID_FIELD | NVME_DNR;
5779     }
5780 
5781     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5782         ns = nvme_ns(n, i);
5783         if (!ns) {
5784             if (!active) {
5785                 ns = nvme_subsys_ns(n->subsys, i);
5786                 if (!ns) {
5787                     continue;
5788                 }
5789             } else {
5790                 continue;
5791             }
5792         }
5793         if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5794             continue;
5795         }
5796         list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5797         if (j == data_len / sizeof(uint32_t)) {
5798             break;
5799         }
5800     }
5801 
5802     return nvme_c2h(n, list, data_len, req);
5803 }
5804 
5805 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5806 {
5807     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5808     uint16_t *nr_ids = &list[0];
5809     uint16_t *ids = &list[1];
5810     uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5811 
5812     /*
5813      * The current nvme-subsys only supports Endurance Group #1.
5814      */
5815     if (!endgid) {
5816         *nr_ids = 1;
5817         ids[0] = 1;
5818     } else {
5819         *nr_ids = 0;
5820     }
5821 
5822     return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5823 }
5824 
5825 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5826 {
5827     NvmeNamespace *ns;
5828     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5829     uint32_t nsid = le32_to_cpu(c->nsid);
5830     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5831     uint8_t *pos = list;
5832     struct {
5833         NvmeIdNsDescr hdr;
5834         uint8_t v[NVME_NIDL_UUID];
5835     } QEMU_PACKED uuid = {};
5836     struct {
5837         NvmeIdNsDescr hdr;
5838         uint8_t v[NVME_NIDL_NGUID];
5839     } QEMU_PACKED nguid = {};
5840     struct {
5841         NvmeIdNsDescr hdr;
5842         uint64_t v;
5843     } QEMU_PACKED eui64 = {};
5844     struct {
5845         NvmeIdNsDescr hdr;
5846         uint8_t v;
5847     } QEMU_PACKED csi = {};
5848 
5849     trace_pci_nvme_identify_ns_descr_list(nsid);
5850 
5851     if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5852         return NVME_INVALID_NSID | NVME_DNR;
5853     }
5854 
5855     ns = nvme_ns(n, nsid);
5856     if (unlikely(!ns)) {
5857         return NVME_INVALID_FIELD | NVME_DNR;
5858     }
5859 
5860     if (!qemu_uuid_is_null(&ns->params.uuid)) {
5861         uuid.hdr.nidt = NVME_NIDT_UUID;
5862         uuid.hdr.nidl = NVME_NIDL_UUID;
5863         memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5864         memcpy(pos, &uuid, sizeof(uuid));
5865         pos += sizeof(uuid);
5866     }
5867 
5868     if (!nvme_nguid_is_null(&ns->params.nguid)) {
5869         nguid.hdr.nidt = NVME_NIDT_NGUID;
5870         nguid.hdr.nidl = NVME_NIDL_NGUID;
5871         memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
5872         memcpy(pos, &nguid, sizeof(nguid));
5873         pos += sizeof(nguid);
5874     }
5875 
5876     if (ns->params.eui64) {
5877         eui64.hdr.nidt = NVME_NIDT_EUI64;
5878         eui64.hdr.nidl = NVME_NIDL_EUI64;
5879         eui64.v = cpu_to_be64(ns->params.eui64);
5880         memcpy(pos, &eui64, sizeof(eui64));
5881         pos += sizeof(eui64);
5882     }
5883 
5884     csi.hdr.nidt = NVME_NIDT_CSI;
5885     csi.hdr.nidl = NVME_NIDL_CSI;
5886     csi.v = ns->csi;
5887     memcpy(pos, &csi, sizeof(csi));
5888     pos += sizeof(csi);
5889 
5890     return nvme_c2h(n, list, sizeof(list), req);
5891 }
5892 
5893 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5894 {
5895     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5896     static const int data_len = sizeof(list);
5897 
5898     trace_pci_nvme_identify_cmd_set();
5899 
5900     NVME_SET_CSI(*list, NVME_CSI_NVM);
5901     NVME_SET_CSI(*list, NVME_CSI_ZONED);
5902 
5903     return nvme_c2h(n, list, data_len, req);
5904 }
5905 
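/*
 * Identify admin command: dispatch on the CNS value to the individual
 * identify data structure handlers.
 */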
5906 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5907 {
5908     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5909 
5910     trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5911                             c->csi);
5912 
5913     switch (c->cns) {
5914     case NVME_ID_CNS_NS:
5915         return nvme_identify_ns(n, req, true);
5916     case NVME_ID_CNS_NS_PRESENT:
5917         return nvme_identify_ns(n, req, false);
5918     case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5919         return nvme_identify_ctrl_list(n, req, true);
5920     case NVME_ID_CNS_CTRL_LIST:
5921         return nvme_identify_ctrl_list(n, req, false);
5922     case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5923         return nvme_identify_pri_ctrl_cap(n, req);
5924     case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5925         return nvme_identify_sec_ctrl_list(n, req);
5926     case NVME_ID_CNS_CS_NS:
5927         return nvme_identify_ns_csi(n, req, true);
5928     case NVME_ID_CNS_CS_NS_PRESENT:
5929         return nvme_identify_ns_csi(n, req, false);
5930     case NVME_ID_CNS_CTRL:
5931         return nvme_identify_ctrl(n, req);
5932     case NVME_ID_CNS_CS_CTRL:
5933         return nvme_identify_ctrl_csi(n, req);
5934     case NVME_ID_CNS_NS_ACTIVE_LIST:
5935         return nvme_identify_nslist(n, req, true);
5936     case NVME_ID_CNS_NS_PRESENT_LIST:
5937         return nvme_identify_nslist(n, req, false);
5938     case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
5939         return nvme_identify_nslist_csi(n, req, true);
5940     case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
5941         return nvme_endurance_group_list(n, req);
5942     case NVME_ID_CNS_CS_NS_PRESENT_LIST:
5943         return nvme_identify_nslist_csi(n, req, false);
5944     case NVME_ID_CNS_NS_DESCR_LIST:
5945         return nvme_identify_ns_descr_list(n, req);
5946     case NVME_ID_CNS_IO_COMMAND_SET:
5947         return nvme_identify_cmd_set(n, req);
5948     default:
5949         trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
5950         return NVME_INVALID_FIELD | NVME_DNR;
5951     }
5952 }
5953 
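/*
 * Abort admin command: dword 0 bit 0 stays set ("not aborted") unless a
 * matching Asynchronous Event Request is found on the admin queue;
 * otherwise a matching outstanding request on the given submission queue
 * is cancelled asynchronously on a best-effort basis.
 */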
5954 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
5955 {
5956     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
5957     uint16_t cid  = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
5958     NvmeSQueue *sq = n->sq[sqid];
5959     NvmeRequest *r, *next;
5960     int i;
5961 
5962     req->cqe.result = 1;
5963     if (nvme_check_sqid(n, sqid)) {
5964         return NVME_INVALID_FIELD | NVME_DNR;
5965     }
5966 
5967     if (sqid == 0) {
5968         for (i = 0; i < n->outstanding_aers; i++) {
5969             NvmeRequest *re = n->aer_reqs[i];
5970             if (re->cqe.cid == cid) {
5971                 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
5972                          (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
5973                 n->outstanding_aers--;
5974                 re->status = NVME_CMD_ABORT_REQ;
5975                 req->cqe.result = 0;
5976                 nvme_enqueue_req_completion(&n->admin_cq, re);
5977                 return NVME_SUCCESS;
5978             }
5979         }
5980     }
5981 
5982     QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
5983         if (r->cqe.cid == cid) {
5984             if (r->aiocb) {
5985                 blk_aio_cancel_async(r->aiocb);
5986             }
5987             break;
5988         }
5989     }
5990 
5991     return NVME_SUCCESS;
5992 }
5993 
5994 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
5995 {
5996     trace_pci_nvme_setfeat_timestamp(ts);
5997 
5998     n->host_timestamp = le64_to_cpu(ts);
5999     n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6000 }
6001 
6002 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6003 {
6004     uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6005     uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6006 
6007     union nvme_timestamp {
6008         struct {
6009             uint64_t timestamp:48;
6010             uint64_t sync:1;
6011             uint64_t origin:3;
6012             uint64_t rsvd1:12;
6013         };
6014         uint64_t all;
6015     };
6016 
6017     union nvme_timestamp ts;
6018     ts.all = 0;
6019     ts.timestamp = n->host_timestamp + elapsed_time;
6020 
6021     /* If the host timestamp is non-zero, set the timestamp origin */
6022     ts.origin = n->host_timestamp ? 0x01 : 0x00;
6023 
6024     trace_pci_nvme_getfeat_timestamp(ts.all);
6025 
6026     return cpu_to_le64(ts.all);
6027 }
6028 
6029 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6030 {
6031     uint64_t timestamp = nvme_get_timestamp(n);
6032 
6033     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6034 }
6035 
6036 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6037                                 uint32_t *result)
6038 {
6039     *result = 0;
6040 
6041     if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6042         return NVME_INVALID_FIELD | NVME_DNR;
6043     }
6044 
6045     *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6046     *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6047 
6048     return NVME_SUCCESS;
6049 }
6050 
6051 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6052                                             NvmeRequest *req, uint32_t *result)
6053 {
6054     NvmeCmd *cmd = &req->cmd;
6055     uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6056     uint16_t ph = cdw11 & 0xffff;
6057     uint8_t noet = (cdw11 >> 16) & 0xff;
6058     uint16_t ruhid, ret;
6059     uint32_t nentries = 0;
6060     uint8_t s_events_ndx = 0;
6061     size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6062     g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6063     NvmeRuHandle *ruh;
6064     NvmeFdpEventDescr *s_event;
6065 
6066     if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6067         return NVME_FDP_DISABLED | NVME_DNR;
6068     }
6069 
6070     if (!nvme_ph_valid(ns, ph)) {
6071         return NVME_INVALID_FIELD | NVME_DNR;
6072     }
6073 
6074     ruhid = ns->fdp.phs[ph];
6075     ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6076 
6077     assert(ruh);
6078 
6079     if (unlikely(noet == 0)) {
6080         return NVME_INVALID_FIELD | NVME_DNR;
6081     }
6082 
6083     for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6084         uint8_t shift = nvme_fdp_evf_shifts[event_type];
6085         if (!shift && event_type) {
6086             /*
6087              * Only the first entry (event_type == 0) has a valid shift value
6088              * of 0; for other entries a zero shift means they are unpopulated.
6089              */
6090             continue;
6091         }
6092 
6093         nentries++;
6094 
6095         s_event = &s_events[s_events_ndx];
6096         s_event->evt = event_type;
6097         s_event->evta = (ruh->event_filter >> shift) & 0x1;
6098 
6099         /* break if all `noet` entries are filled */
6100         if ((++s_events_ndx) == noet) {
6101             break;
6102         }
6103     }
6104 
6105     ret = nvme_c2h(n, (uint8_t *)s_events, s_events_siz, req);
6106     if (ret) {
6107         return ret;
6108     }
6109 
6110     *result = nentries;
6111     return NVME_SUCCESS;
6112 }
6113 
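/*
 * Get Features admin command: validate the Feature Identifier and namespace
 * scope, honour the SEL field (current, default, saved, capabilities) and
 * return the feature value in completion dword 0.
 */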
6114 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6115 {
6116     NvmeCmd *cmd = &req->cmd;
6117     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6118     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6119     uint32_t nsid = le32_to_cpu(cmd->nsid);
6120     uint32_t result = 0;
6121     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6122     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6123     uint16_t iv;
6124     NvmeNamespace *ns;
6125     int i;
6126     uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6127 
6128     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6129         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6130     };
6131 
6132     trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6133 
6134     if (!nvme_feature_support[fid]) {
6135         return NVME_INVALID_FIELD | NVME_DNR;
6136     }
6137 
6138     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6139         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6140             /*
6141              * The Reservation Notification Mask and Reservation Persistence
6142              * features require a status code of Invalid Field in Command when
6143              * NSID is FFFFFFFFh. Since the device does not support those
6144              * features we can always return Invalid Namespace or Format as we
6145              * should do for all other features.
6146              */
6147             return NVME_INVALID_NSID | NVME_DNR;
6148         }
6149 
6150         if (!nvme_ns(n, nsid)) {
6151             return NVME_INVALID_FIELD | NVME_DNR;
6152         }
6153     }
6154 
6155     switch (sel) {
6156     case NVME_GETFEAT_SELECT_CURRENT:
6157         break;
6158     case NVME_GETFEAT_SELECT_SAVED:
6159         /* no features are saveable by the controller; fallthrough */
6160     case NVME_GETFEAT_SELECT_DEFAULT:
6161         goto defaults;
6162     case NVME_GETFEAT_SELECT_CAP:
6163         result = nvme_feature_cap[fid];
6164         goto out;
6165     }
6166 
6167     switch (fid) {
6168     case NVME_TEMPERATURE_THRESHOLD:
6169         result = 0;
6170 
6171         /*
6172          * The controller only implements the Composite Temperature sensor, so
6173          * return 0 for all other sensors.
6174          */
6175         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6176             goto out;
6177         }
6178 
6179         switch (NVME_TEMP_THSEL(dw11)) {
6180         case NVME_TEMP_THSEL_OVER:
6181             result = n->features.temp_thresh_hi;
6182             goto out;
6183         case NVME_TEMP_THSEL_UNDER:
6184             result = n->features.temp_thresh_low;
6185             goto out;
6186         }
6187 
6188         return NVME_INVALID_FIELD | NVME_DNR;
6189     case NVME_ERROR_RECOVERY:
6190         if (!nvme_nsid_valid(n, nsid)) {
6191             return NVME_INVALID_NSID | NVME_DNR;
6192         }
6193 
6194         ns = nvme_ns(n, nsid);
6195         if (unlikely(!ns)) {
6196             return NVME_INVALID_FIELD | NVME_DNR;
6197         }
6198 
6199         result = ns->features.err_rec;
6200         goto out;
6201     case NVME_VOLATILE_WRITE_CACHE:
6202         result = 0;
6203         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6204             ns = nvme_ns(n, i);
6205             if (!ns) {
6206                 continue;
6207             }
6208 
6209             result = blk_enable_write_cache(ns->blkconf.blk);
6210             if (result) {
6211                 break;
6212             }
6213         }
6214         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6215         goto out;
6216     case NVME_ASYNCHRONOUS_EVENT_CONF:
6217         result = n->features.async_config;
6218         goto out;
6219     case NVME_TIMESTAMP:
6220         return nvme_get_feature_timestamp(n, req);
6221     case NVME_HOST_BEHAVIOR_SUPPORT:
6222         return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6223                         sizeof(n->features.hbs), req);
6224     case NVME_FDP_MODE:
6225         endgrpid = dw11 & 0xff;
6226 
6227         if (endgrpid != 0x1) {
6228             return NVME_INVALID_FIELD | NVME_DNR;
6229         }
6230 
6231         ret = nvme_get_feature_fdp(n, endgrpid, &result);
6232         if (ret) {
6233             return ret;
6234         }
6235         goto out;
6236     case NVME_FDP_EVENTS:
6237         if (!nvme_nsid_valid(n, nsid)) {
6238             return NVME_INVALID_NSID | NVME_DNR;
6239         }
6240 
6241         ns = nvme_ns(n, nsid);
6242         if (unlikely(!ns)) {
6243             return NVME_INVALID_FIELD | NVME_DNR;
6244         }
6245 
6246         ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6247         if (ret) {
6248             return ret;
6249         }
6250         goto out;
6251     default:
6252         break;
6253     }
6254 
6255 defaults:
6256     switch (fid) {
6257     case NVME_TEMPERATURE_THRESHOLD:
6258         result = 0;
6259 
6260         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6261             break;
6262         }
6263 
6264         if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6265             result = NVME_TEMPERATURE_WARNING;
6266         }
6267 
6268         break;
6269     case NVME_NUMBER_OF_QUEUES:
6270         result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6271         trace_pci_nvme_getfeat_numq(result);
6272         break;
6273     case NVME_INTERRUPT_VECTOR_CONF:
6274         iv = dw11 & 0xffff;
6275         if (iv >= n->conf_ioqpairs + 1) {
6276             return NVME_INVALID_FIELD | NVME_DNR;
6277         }
6278 
6279         result = iv;
6280         if (iv == n->admin_cq.vector) {
6281             result |= NVME_INTVC_NOCOALESCING;
6282         }
6283         break;
6284     case NVME_FDP_MODE:
6285         endgrpid = dw11 & 0xff;
6286 
6287         if (endgrpid != 0x1) {
6288             return NVME_INVALID_FIELD | NVME_DNR;
6289         }
6290 
6291         ret = nvme_get_feature_fdp(n, endgrpid, &result);
6292         if (ret) {
6293             return ret;
6294         }
6295         goto out;
6298     default:
6299         result = nvme_feature_default[fid];
6300         break;
6301     }
6302 
6303 out:
6304     req->cqe.result = cpu_to_le32(result);
6305     return ret;
6306 }
6307 
6308 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6309 {
6310     uint16_t ret;
6311     uint64_t timestamp;
6312 
6313     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6314     if (ret) {
6315         return ret;
6316     }
6317 
6318     nvme_set_timestamp(n, timestamp);
6319 
6320     return NVME_SUCCESS;
6321 }
6322 
6323 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6324                                             NvmeRequest *req)
6325 {
6326     NvmeCmd *cmd = &req->cmd;
6327     uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6328     uint16_t ph = cdw11 & 0xffff;
6329     uint8_t noet = (cdw11 >> 16) & 0xff;
6330     uint16_t ret, ruhid;
6331     uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6332     uint8_t event_mask = 0;
6333     unsigned int i;
6334     g_autofree uint8_t *events = g_malloc0(noet);
6335     NvmeRuHandle *ruh = NULL;
6336 
6337     assert(ns);
6338 
6339     if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6340         return NVME_FDP_DISABLED | NVME_DNR;
6341     }
6342 
6343     if (!nvme_ph_valid(ns, ph)) {
6344         return NVME_INVALID_FIELD | NVME_DNR;
6345     }
6346 
6347     ruhid = ns->fdp.phs[ph];
6348     ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6349 
6350     ret = nvme_h2c(n, events, noet, req);
6351     if (ret) {
6352         return ret;
6353     }
6354 
6355     for (i = 0; i < noet; i++) {
6356         event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6357     }
6358 
6359     if (enable) {
6360         ruh->event_filter |= event_mask;
6361     } else {
6362         ruh->event_filter = ruh->event_filter & ~event_mask;
6363     }
6364 
6365     return NVME_SUCCESS;
6366 }
6367 
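/*
 * Set Features admin command: check saveability, namespace scope and
 * changeability of the Feature Identifier before applying the new value.
 */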
6368 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6369 {
6370     NvmeNamespace *ns = NULL;
6371 
6372     NvmeCmd *cmd = &req->cmd;
6373     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6374     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6375     uint32_t nsid = le32_to_cpu(cmd->nsid);
6376     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6377     uint8_t save = NVME_SETFEAT_SAVE(dw10);
6378     uint16_t status;
6379     int i;
6380 
6381     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6382 
6383     if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6384         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6385     }
6386 
6387     if (!nvme_feature_support[fid]) {
6388         return NVME_INVALID_FIELD | NVME_DNR;
6389     }
6390 
6391     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6392         if (nsid != NVME_NSID_BROADCAST) {
6393             if (!nvme_nsid_valid(n, nsid)) {
6394                 return NVME_INVALID_NSID | NVME_DNR;
6395             }
6396 
6397             ns = nvme_ns(n, nsid);
6398             if (unlikely(!ns)) {
6399                 return NVME_INVALID_FIELD | NVME_DNR;
6400             }
6401         }
6402     } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6403         if (!nvme_nsid_valid(n, nsid)) {
6404             return NVME_INVALID_NSID | NVME_DNR;
6405         }
6406 
6407         return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6408     }
6409 
6410     if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6411         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6412     }
6413 
6414     switch (fid) {
6415     case NVME_TEMPERATURE_THRESHOLD:
6416         if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6417             break;
6418         }
6419 
6420         switch (NVME_TEMP_THSEL(dw11)) {
6421         case NVME_TEMP_THSEL_OVER:
6422             n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6423             break;
6424         case NVME_TEMP_THSEL_UNDER:
6425             n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6426             break;
6427         default:
6428             return NVME_INVALID_FIELD | NVME_DNR;
6429         }
6430 
6431         if ((n->temperature >= n->features.temp_thresh_hi) ||
6432             (n->temperature <= n->features.temp_thresh_low)) {
6433             nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6434         }
6435 
6436         break;
6437     case NVME_ERROR_RECOVERY:
6438         if (nsid == NVME_NSID_BROADCAST) {
6439             for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6440                 ns = nvme_ns(n, i);
6441 
6442                 if (!ns) {
6443                     continue;
6444                 }
6445 
6446                 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6447                     ns->features.err_rec = dw11;
6448                 }
6449             }
6450 
6451             break;
6452         }
6453 
6454         assert(ns);
6455         if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
6456             ns->features.err_rec = dw11;
6457         }
6458         break;
6459     case NVME_VOLATILE_WRITE_CACHE:
6460         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6461             ns = nvme_ns(n, i);
6462             if (!ns) {
6463                 continue;
6464             }
6465 
6466             if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6467                 blk_flush(ns->blkconf.blk);
6468             }
6469 
6470             blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6471         }
6472 
6473         break;
6474 
6475     case NVME_NUMBER_OF_QUEUES:
6476         if (n->qs_created) {
6477             return NVME_CMD_SEQ_ERROR | NVME_DNR;
6478         }
6479 
6480         /*
6481          * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6482          * and NSQR.
6483          */
6484         if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6485             return NVME_INVALID_FIELD | NVME_DNR;
6486         }
6487 
6488         trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6489                                     ((dw11 >> 16) & 0xffff) + 1,
6490                                     n->conf_ioqpairs,
6491                                     n->conf_ioqpairs);
6492         req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6493                                       ((n->conf_ioqpairs - 1) << 16));
6494         break;
6495     case NVME_ASYNCHRONOUS_EVENT_CONF:
6496         n->features.async_config = dw11;
6497         break;
6498     case NVME_TIMESTAMP:
6499         return nvme_set_feature_timestamp(n, req);
6500     case NVME_HOST_BEHAVIOR_SUPPORT:
6501         status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6502                           sizeof(n->features.hbs), req);
6503         if (status) {
6504             return status;
6505         }
6506 
6507         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6508             ns = nvme_ns(n, i);
6509 
6510             if (!ns) {
6511                 continue;
6512             }
6513 
6514             ns->id_ns.nlbaf = ns->nlbaf - 1;
6515             if (!n->features.hbs.lbafee) {
6516                 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6517             }
6518         }
6519 
6520         return status;
6521     case NVME_COMMAND_SET_PROFILE:
6522         if (dw11 & 0x1ff) {
6523             trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6524             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
6525         }
6526         break;
6527     case NVME_FDP_MODE:
6528         /* spec: abort with cmd seq err if there are namespaces in the endgrp */
6529         return NVME_CMD_SEQ_ERROR | NVME_DNR;
6530     case NVME_FDP_EVENTS:
6531         return nvme_set_feature_fdp_events(n, ns, req);
6532     default:
6533         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6534     }
6535     return NVME_SUCCESS;
6536 }
6537 
6538 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6539 {
6540     trace_pci_nvme_aer(nvme_cid(req));
6541 
6542     if (n->outstanding_aers > n->params.aerl) {
6543         trace_pci_nvme_aer_aerl_exceeded();
6544         return NVME_AER_LIMIT_EXCEEDED;
6545     }
6546 
6547     n->aer_reqs[n->outstanding_aers] = req;
6548     n->outstanding_aers++;
6549 
6550     if (!QTAILQ_EMPTY(&n->aer_queue)) {
6551         nvme_process_aers(n);
6552     }
6553 
6554     return NVME_NO_COMPLETE;
6555 }
6556 
6557 static void nvme_update_dmrsl(NvmeCtrl *n)
6558 {
6559     int nsid;
6560 
6561     for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6562         NvmeNamespace *ns = nvme_ns(n, nsid);
6563         if (!ns) {
6564             continue;
6565         }
6566 
6567         n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6568                                 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6569     }
6570 }
6571 
6572 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
6573 {
6574     uint32_t cc = ldl_le_p(&n->bar.cc);
6575 
6576     ns->iocs = nvme_cse_iocs_none;
6577     switch (ns->csi) {
6578     case NVME_CSI_NVM:
6579         if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
6580             ns->iocs = nvme_cse_iocs_nvm;
6581         }
6582         break;
6583     case NVME_CSI_ZONED:
6584         if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
6585             ns->iocs = nvme_cse_iocs_zoned;
6586         } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
6587             ns->iocs = nvme_cse_iocs_nvm;
6588         }
6589         break;
6590     }
6591 }
6592 
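/*
 * Namespace Attachment admin command: attach or detach the namespace to or
 * from each controller in the host-supplied controller list and enqueue a
 * Namespace Attribute Changed event for each affected controller.
 */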
6593 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6594 {
6595     NvmeNamespace *ns;
6596     NvmeCtrl *ctrl;
6597     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6598     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6599     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6600     uint8_t sel = dw10 & 0xf;
6601     uint16_t *nr_ids = &list[0];
6602     uint16_t *ids = &list[1];
6603     uint16_t ret;
6604     int i;
6605 
6606     trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6607 
6608     if (!nvme_nsid_valid(n, nsid)) {
6609         return NVME_INVALID_NSID | NVME_DNR;
6610     }
6611 
6612     ns = nvme_subsys_ns(n->subsys, nsid);
6613     if (!ns) {
6614         return NVME_INVALID_FIELD | NVME_DNR;
6615     }
6616 
6617     ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6618     if (ret) {
6619         return ret;
6620     }
6621 
6622     if (!*nr_ids) {
6623         return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6624     }
6625 
6626     *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6627     for (i = 0; i < *nr_ids; i++) {
6628         ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6629         if (!ctrl) {
6630             return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6631         }
6632 
6633         switch (sel) {
6634         case NVME_NS_ATTACHMENT_ATTACH:
6635             if (nvme_ns(ctrl, nsid)) {
6636                 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6637             }
6638 
6639             if (ns->attached && !ns->params.shared) {
6640                 return NVME_NS_PRIVATE | NVME_DNR;
6641             }
6642 
6643             nvme_attach_ns(ctrl, ns);
6644             nvme_select_iocs_ns(ctrl, ns);
6645 
6646             break;
6647 
6648         case NVME_NS_ATTACHMENT_DETACH:
6649             if (!nvme_ns(ctrl, nsid)) {
6650                 return NVME_NS_NOT_ATTACHED | NVME_DNR;
6651             }
6652 
6653             ctrl->namespaces[nsid] = NULL;
6654             ns->attached--;
6655 
6656             nvme_update_dmrsl(ctrl);
6657 
6658             break;
6659 
6660         default:
6661             return NVME_INVALID_FIELD | NVME_DNR;
6662         }
6663 
6664         /*
6665          * Add the namespace id to the changed namespace id list; the host
6666          * clears the event by reading the Changed Namespace List log page.
6667          */
6668         if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6669             nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6670                                NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6671                                NVME_LOG_CHANGED_NSLIST);
6672         }
6673     }
6674 
6675     return NVME_SUCCESS;
6676 }
6677 
6678 typedef struct NvmeFormatAIOCB {
6679     BlockAIOCB common;
6680     BlockAIOCB *aiocb;
6681     NvmeRequest *req;
6682     int ret;
6683 
6684     NvmeNamespace *ns;
6685     uint32_t nsid;
6686     bool broadcast;
6687     int64_t offset;
6688 
6689     uint8_t lbaf;
6690     uint8_t mset;
6691     uint8_t pi;
6692     uint8_t pil;
6693 } NvmeFormatAIOCB;
6694 
6695 static void nvme_format_cancel(BlockAIOCB *aiocb)
6696 {
6697     NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6698 
6699     iocb->ret = -ECANCELED;
6700 
6701     if (iocb->aiocb) {
6702         blk_aio_cancel_async(iocb->aiocb);
6703         iocb->aiocb = NULL;
6704     }
6705 }
6706 
6707 static const AIOCBInfo nvme_format_aiocb_info = {
6708     .aiocb_size = sizeof(NvmeFormatAIOCB),
6709     .cancel_async = nvme_format_cancel,
6710 };
6711 
6712 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6713                             uint8_t pi, uint8_t pil)
6714 {
6715     uint8_t lbafl = lbaf & 0xf;
6716     uint8_t lbafu = lbaf >> 4;
6717 
6718     trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6719 
6720     ns->id_ns.dps = (pil << 3) | pi;
6721     ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6722 
6723     nvme_ns_init_format(ns);
6724 }
6725 
6726 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6727 
6728 static void nvme_format_ns_cb(void *opaque, int ret)
6729 {
6730     NvmeFormatAIOCB *iocb = opaque;
6731     NvmeNamespace *ns = iocb->ns;
6732     int bytes;
6733 
6734     if (iocb->ret < 0) {
6735         goto done;
6736     } else if (ret < 0) {
6737         iocb->ret = ret;
6738         goto done;
6739     }
6740 
6741     assert(ns);
6742 
6743     if (iocb->offset < ns->size) {
6744         bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6745 
6746         iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6747                                             bytes, BDRV_REQ_MAY_UNMAP,
6748                                             nvme_format_ns_cb, iocb);
6749 
6750         iocb->offset += bytes;
6751         return;
6752     }
6753 
6754     nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6755     ns->status = 0x0;
6756     iocb->ns = NULL;
6757     iocb->offset = 0;
6758 
6759 done:
6760     nvme_do_format(iocb);
6761 }
6762 
6763 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6764 {
6765     if (ns->params.zoned) {
6766         return NVME_INVALID_FORMAT | NVME_DNR;
6767     }
6768 
6769     if (lbaf > ns->id_ns.nlbaf) {
6770         return NVME_INVALID_FORMAT | NVME_DNR;
6771     }
6772 
6773     if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6774         return NVME_INVALID_FORMAT | NVME_DNR;
6775     }
6776 
6777     if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6778         return NVME_INVALID_FIELD | NVME_DNR;
6779     }
6780 
6781     return NVME_SUCCESS;
6782 }
6783 
6784 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6785 {
6786     NvmeRequest *req = iocb->req;
6787     NvmeCtrl *n = nvme_ctrl(req);
6788     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6789     uint8_t lbaf = dw10 & 0xf;
6790     uint8_t pi = (dw10 >> 5) & 0x7;
6791     uint16_t status;
6792     int i;
6793 
6794     if (iocb->ret < 0) {
6795         goto done;
6796     }
6797 
6798     if (iocb->broadcast) {
6799         for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6800             iocb->ns = nvme_ns(n, i);
6801             if (iocb->ns) {
6802                 iocb->nsid = i;
6803                 break;
6804             }
6805         }
6806     }
6807 
6808     if (!iocb->ns) {
6809         goto done;
6810     }
6811 
6812     status = nvme_format_check(iocb->ns, lbaf, pi);
6813     if (status) {
6814         req->status = status;
6815         goto done;
6816     }
6817 
6818     iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
6819     nvme_format_ns_cb(iocb, 0);
6820     return;
6821 
6822 done:
6823     iocb->common.cb(iocb->common.opaque, iocb->ret);
6824     qemu_aio_unref(iocb);
6825 }
6826 
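/*
 * Format NVM admin command: set up the asynchronous format state and start
 * nvme_do_format(), which zeroes each selected namespace and applies the
 * new LBA format; the broadcast NSID formats all attached namespaces.
 */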
6827 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
6828 {
6829     NvmeFormatAIOCB *iocb;
6830     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6831     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6832     uint8_t lbaf = dw10 & 0xf;
6833     uint8_t mset = (dw10 >> 4) & 0x1;
6834     uint8_t pi = (dw10 >> 5) & 0x7;
6835     uint8_t pil = (dw10 >> 8) & 0x1;
6836     uint8_t lbafu = (dw10 >> 12) & 0x3;
6837     uint16_t status;
6838 
6839     iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
6840 
6841     iocb->req = req;
6842     iocb->ret = 0;
6843     iocb->ns = NULL;
6844     iocb->nsid = 0;
6845     iocb->lbaf = lbaf;
6846     iocb->mset = mset;
6847     iocb->pi = pi;
6848     iocb->pil = pil;
6849     iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
6850     iocb->offset = 0;
6851 
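         /*
          * The upper LBA format bits (LBAFU) are only honored when the host
          * has enabled LBA Format Extension via the Host Behavior Support
          * feature.
          */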
6852     if (n->features.hbs.lbafee) {
6853         iocb->lbaf |= lbafu << 4;
6854     }
6855 
6856     if (!iocb->broadcast) {
6857         if (!nvme_nsid_valid(n, nsid)) {
6858             status = NVME_INVALID_NSID | NVME_DNR;
6859             goto out;
6860         }
6861 
6862         iocb->ns = nvme_ns(n, nsid);
6863         if (!iocb->ns) {
6864             status = NVME_INVALID_FIELD | NVME_DNR;
6865             goto out;
6866         }
6867     }
6868 
6869     req->aiocb = &iocb->common;
6870     nvme_do_format(iocb);
6871 
6872     return NVME_NO_COMPLETE;
6873 
6874 out:
6875     qemu_aio_unref(iocb);
6876 
6877     return status;
6878 }
6879 
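     /*
      * Look up the total number of flexible resources of the given type (VQ or
      * VI) and how many of them are currently assigned to the primary and to
      * the secondary controllers.
      */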
6880 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
6881                                   int *num_prim, int *num_sec)
6882 {
6883     *num_total = le32_to_cpu(rt ?
6884                              n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
6885     *num_prim = le16_to_cpu(rt ?
6886                             n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
6887     *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
6888 }
6889 
6890 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
6891                                              uint16_t cntlid, uint8_t rt,
6892                                              int nr)
6893 {
6894     int num_total, num_prim, num_sec;
6895 
6896     if (cntlid != n->cntlid) {
6897         return NVME_INVALID_CTRL_ID | NVME_DNR;
6898     }
6899 
6900     nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6901 
6902     if (nr > num_total) {
6903         return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6904     }
6905 
6906     if (nr > num_total - num_sec) {
6907         return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6908     }
6909 
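         /*
          * The new allocation for the primary controller is only staged in
          * next_pri_ctrl_cap here; nvme_activate_virt_res() applies it to the
          * reported capabilities on a subsequent reset.
          */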
6910     if (rt) {
6911         n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
6912     } else {
6913         n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
6914     }
6915 
6916     req->cqe.result = cpu_to_le32(nr);
6917     return req->status;
6918 }
6919 
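     /*
      * Adjust the number of flexible resources assigned to a secondary
      * controller and keep the primary controller's running total of
      * secondary assignments (VQRFA/VIRFA) in sync.
      */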
6920 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
6921                                  uint8_t rt, int nr)
6922 {
6923     int prev_nr, prev_total;
6924 
6925     if (rt) {
6926         prev_nr = le16_to_cpu(sctrl->nvi);
6927         prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
6928         sctrl->nvi = cpu_to_le16(nr);
6929         n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
6930     } else {
6931         prev_nr = le16_to_cpu(sctrl->nvq);
6932         prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
6933         sctrl->nvq = cpu_to_le16(nr);
6934         n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
6935     }
6936 }
6937 
6938 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
6939                                             uint16_t cntlid, uint8_t rt, int nr)
6940 {
6941     int num_total, num_prim, num_sec, num_free, diff, limit;
6942     NvmeSecCtrlEntry *sctrl;
6943 
6944     sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6945     if (!sctrl) {
6946         return NVME_INVALID_CTRL_ID | NVME_DNR;
6947     }
6948 
6949     if (sctrl->scs) {
6950         return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6951     }
6952 
6953     limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
6954     if (nr > limit) {
6955         return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6956     }
6957 
6958     nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6959     num_free = num_total - num_prim - num_sec;
6960     diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
6961 
6962     if (diff > num_free) {
6963         return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6964     }
6965 
6966     nvme_update_virt_res(n, sctrl, rt, nr);
6967     req->cqe.result = cpu_to_le32(nr);
6968 
6969     return req->status;
6970 }
6971 
6972 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
6973 {
6974     PCIDevice *pci = PCI_DEVICE(n);
6975     NvmeCtrl *sn = NULL;
6976     NvmeSecCtrlEntry *sctrl;
6977     int vf_index;
6978 
6979     sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6980     if (!sctrl) {
6981         return NVME_INVALID_CTRL_ID | NVME_DNR;
6982     }
6983 
6984     if (!pci_is_vf(pci)) {
6985         vf_index = le16_to_cpu(sctrl->vfn) - 1;
6986         sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
6987     }
6988 
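         /*
          * A secondary controller can only be brought online if it has at
          * least one VI resource, at least two VQ resources and maps to an
          * existing VF.
          */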
6989     if (online) {
6990         if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
6991             return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6992         }
6993 
6994         if (!sctrl->scs) {
6995             sctrl->scs = 0x1;
6996             nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6997         }
6998     } else {
6999         nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7000         nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7001 
7002         if (sctrl->scs) {
7003             sctrl->scs = 0x0;
7004             if (sn) {
7005                 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7006             }
7007         }
7008     }
7009 
7010     return NVME_SUCCESS;
7011 }
7012 
7013 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7014 {
7015     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7016     uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7017     uint8_t act = dw10 & 0xf;
7018     uint8_t rt = (dw10 >> 8) & 0x7;
7019     uint16_t cntlid = (dw10 >> 16) & 0xffff;
7020     int nr = dw11 & 0xffff;
7021 
7022     trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7023 
7024     if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7025         return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7026     }
7027 
7028     switch (act) {
7029     case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7030         return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7031     case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7032         return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7033     case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7034         return nvme_virt_set_state(n, cntlid, true);
7035     case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7036         return nvme_virt_set_state(n, cntlid, false);
7037     default:
7038         return NVME_INVALID_FIELD | NVME_DNR;
7039     }
7040 }
7041 
7042 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7043 {
7044     PCIDevice *pci = PCI_DEVICE(n);
7045     uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7046     uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7047     int i;
7048 
7049     /* Address should be page aligned */
7050     if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7051         return NVME_INVALID_FIELD | NVME_DNR;
7052     }
7053 
7054     /* Save shadow buffer base addr for use during queue creation */
7055     n->dbbuf_dbs = dbs_addr;
7056     n->dbbuf_eis = eis_addr;
7057     n->dbbuf_enabled = true;
7058 
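         /*
          * Propagate the shadow doorbell and eventidx addresses to all queues
          * that already exist and seed the shadow entries with the current
          * tail/head values.
          */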
7059     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7060         NvmeSQueue *sq = n->sq[i];
7061         NvmeCQueue *cq = n->cq[i];
7062 
7063         if (sq) {
7064             /*
7065              * CAP.DSTRD is 0, so the offset of the i-th SQ db_addr is (i << 3).
7066              * nvme_process_db() uses this hard-coded way to calculate
7067              * doorbell offsets. Be consistent with that here.
7068              */
7069             sq->db_addr = dbs_addr + (i << 3);
7070             sq->ei_addr = eis_addr + (i << 3);
7071             stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7072 
7073             if (n->params.ioeventfd && sq->sqid != 0) {
7074                 if (!nvme_init_sq_ioeventfd(sq)) {
7075                     sq->ioeventfd_enabled = true;
7076                 }
7077             }
7078         }
7079 
7080         if (cq) {
7081             /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7082             cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7083             cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7084             stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7085 
7086             if (n->params.ioeventfd && cq->cqid != 0) {
7087                 if (!nvme_init_cq_ioeventfd(cq)) {
7088                     cq->ioeventfd_enabled = true;
7089                 }
7090             }
7091         }
7092     }
7093 
7094     trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7095 
7096     return NVME_SUCCESS;
7097 }
7098 
7099 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7100 {
7101     return NVME_INVALID_FIELD | NVME_DNR;
7102 }
7103 
7104 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7105 {
7106     NvmeNamespace *ns;
7107     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7108     uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7109     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7110     uint8_t doper, dtype;
7111     uint32_t numd, trans_len;
7112     NvmeDirectiveIdentify id = {
7113         .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7114         .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7115     };
7116 
7117     numd = dw10 + 1;
7118     doper = dw11 & 0xff;
7119     dtype = (dw11 >> 8) & 0xff;
7120 
7121     trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7122 
7123     if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7124         doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7125         return NVME_INVALID_FIELD | NVME_DNR;
7126     }
7127 
7128     ns = nvme_ns(n, nsid);
7129     if (!ns) {
7130         return NVME_INVALID_FIELD | NVME_DNR;
7131     }
7132 
7133     switch (dtype) {
7134     case NVME_DIRECTIVE_IDENTIFY:
7135         switch (doper) {
7136         case NVME_DIRECTIVE_RETURN_PARAMS:
7137             if (ns->endgrp && ns->endgrp->fdp.enabled) {
7138                 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7139                 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7140                 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7141             }
7142 
7143             return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7144 
7145         default:
7146             return NVME_INVALID_FIELD | NVME_DNR;
7147         }
7148 
7149     default:
7150         return NVME_INVALID_FIELD;
7151     }
7152 }
7153 
7154 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7155 {
7156     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7157                              nvme_adm_opc_str(req->cmd.opcode));
7158 
7159     if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7160         trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7161         return NVME_INVALID_OPCODE | NVME_DNR;
7162     }
7163 
7164     /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7165     if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7166         return NVME_INVALID_FIELD | NVME_DNR;
7167     }
7168 
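         /* Fused operations are not supported for Admin commands */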
7169     if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7170         return NVME_INVALID_FIELD;
7171     }
7172 
7173     switch (req->cmd.opcode) {
7174     case NVME_ADM_CMD_DELETE_SQ:
7175         return nvme_del_sq(n, req);
7176     case NVME_ADM_CMD_CREATE_SQ:
7177         return nvme_create_sq(n, req);
7178     case NVME_ADM_CMD_GET_LOG_PAGE:
7179         return nvme_get_log(n, req);
7180     case NVME_ADM_CMD_DELETE_CQ:
7181         return nvme_del_cq(n, req);
7182     case NVME_ADM_CMD_CREATE_CQ:
7183         return nvme_create_cq(n, req);
7184     case NVME_ADM_CMD_IDENTIFY:
7185         return nvme_identify(n, req);
7186     case NVME_ADM_CMD_ABORT:
7187         return nvme_abort(n, req);
7188     case NVME_ADM_CMD_SET_FEATURES:
7189         return nvme_set_feature(n, req);
7190     case NVME_ADM_CMD_GET_FEATURES:
7191         return nvme_get_feature(n, req);
7192     case NVME_ADM_CMD_ASYNC_EV_REQ:
7193         return nvme_aer(n, req);
7194     case NVME_ADM_CMD_NS_ATTACHMENT:
7195         return nvme_ns_attachment(n, req);
7196     case NVME_ADM_CMD_VIRT_MNGMT:
7197         return nvme_virt_mngmt(n, req);
7198     case NVME_ADM_CMD_DBBUF_CONFIG:
7199         return nvme_dbbuf_config(n, req);
7200     case NVME_ADM_CMD_FORMAT_NVM:
7201         return nvme_format(n, req);
7202     case NVME_ADM_CMD_DIRECTIVE_SEND:
7203         return nvme_directive_send(n, req);
7204     case NVME_ADM_CMD_DIRECTIVE_RECV:
7205         return nvme_directive_receive(n, req);
7206     default:
7207         assert(false);
7208     }
7209 
7210     return NVME_INVALID_OPCODE | NVME_DNR;
7211 }
7212 
7213 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7214 {
7215     trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7216 
7217     stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7218                    MEMTXATTRS_UNSPECIFIED);
7219 }
7220 
7221 static void nvme_update_sq_tail(NvmeSQueue *sq)
7222 {
7223     ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7224                    MEMTXATTRS_UNSPECIFIED);
7225 
7226     trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7227 }
7228 
7229 static void nvme_process_sq(void *opaque)
7230 {
7231     NvmeSQueue *sq = opaque;
7232     NvmeCtrl *n = sq->ctrl;
7233     NvmeCQueue *cq = n->cq[sq->cqid];
7234 
7235     uint16_t status;
7236     hwaddr addr;
7237     NvmeCmd cmd;
7238     NvmeRequest *req;
7239 
7240     if (n->dbbuf_enabled) {
7241         nvme_update_sq_tail(sq);
7242     }
7243 
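         /*
          * Drain the submission queue until it is empty or no free request
          * slots remain; with shadow doorbells enabled the tail is re-read
          * after every command so newly posted entries are picked up without
          * another doorbell write.
          */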
7244     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7245         addr = sq->dma_addr + (sq->head << NVME_SQES);
7246         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7247             trace_pci_nvme_err_addr_read(addr);
7248             trace_pci_nvme_err_cfs();
7249             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7250             break;
7251         }
7252         nvme_inc_sq_head(sq);
7253 
7254         req = QTAILQ_FIRST(&sq->req_list);
7255         QTAILQ_REMOVE(&sq->req_list, req, entry);
7256         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7257         nvme_req_clear(req);
7258         req->cqe.cid = cmd.cid;
7259         memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7260 
7261         status = sq->sqid ? nvme_io_cmd(n, req) :
7262             nvme_admin_cmd(n, req);
7263         if (status != NVME_NO_COMPLETE) {
7264             req->status = status;
7265             nvme_enqueue_req_completion(cq, req);
7266         }
7267 
7268         if (n->dbbuf_enabled) {
7269             nvme_update_sq_eventidx(sq);
7270             nvme_update_sq_tail(sq);
7271         }
7272     }
7273 }
7274 
7275 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7276 {
7277     uint8_t *config;
7278 
7279     if (!msix_present(pci_dev)) {
7280         return;
7281     }
7282 
7283     assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7284 
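         /* the MSI-X Table Size field is encoded as N - 1 */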
7285     config = pci_dev->config + pci_dev->msix_cap;
7286     pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7287                          table_size - 1);
7288 }
7289 
7290 static void nvme_activate_virt_res(NvmeCtrl *n)
7291 {
7292     PCIDevice *pci_dev = PCI_DEVICE(n);
7293     NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7294     NvmeSecCtrlEntry *sctrl;
7295 
7296     /* -1 to account for the admin queue */
7297     if (pci_is_vf(pci_dev)) {
7298         sctrl = nvme_sctrl(n);
7299         cap->vqprt = sctrl->nvq;
7300         cap->viprt = sctrl->nvi;
7301         n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7302         n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7303     } else {
7304         cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7305         cap->virfap = n->next_pri_ctrl_cap.virfap;
7306         n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7307                            le16_to_cpu(cap->vqrfap) - 1;
7308         n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7309                              le16_to_cpu(cap->virfap);
7310     }
7311 }
7312 
7313 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7314 {
7315     PCIDevice *pci_dev = PCI_DEVICE(n);
7316     NvmeSecCtrlEntry *sctrl;
7317     NvmeNamespace *ns;
7318     int i;
7319 
7320     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7321         ns = nvme_ns(n, i);
7322         if (!ns) {
7323             continue;
7324         }
7325 
7326         nvme_ns_drain(ns);
7327     }
7328 
7329     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7330         if (n->sq[i] != NULL) {
7331             nvme_free_sq(n->sq[i], n);
7332         }
7333     }
7334     for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7335         if (n->cq[i] != NULL) {
7336             nvme_free_cq(n->cq[i], n);
7337         }
7338     }
7339 
7340     while (!QTAILQ_EMPTY(&n->aer_queue)) {
7341         NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7342         QTAILQ_REMOVE(&n->aer_queue, event, entry);
7343         g_free(event);
7344     }
7345 
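         /*
          * On a PF reset, take all secondary controllers offline; staged
          * flexible resource assignments are activated for any reset other
          * than a plain controller (CC.EN) reset.
          */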
7346     if (n->params.sriov_max_vfs) {
7347         if (!pci_is_vf(pci_dev)) {
7348             for (i = 0; i < n->nr_sec_ctrls; i++) {
7349                 sctrl = &n->sec_ctrl_list[i];
7350                 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7351             }
7352         }
7353 
7354         if (rst != NVME_RESET_CONTROLLER) {
7355             nvme_activate_virt_res(n);
7356         }
7357     }
7358 
7359     n->aer_queued = 0;
7360     n->aer_mask = 0;
7361     n->outstanding_aers = 0;
7362     n->qs_created = false;
7363 
7364     nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7365 
7366     if (pci_is_vf(pci_dev)) {
7367         sctrl = nvme_sctrl(n);
7368 
7369         stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7370     } else {
7371         stl_le_p(&n->bar.csts, 0);
7372     }
7373 
7374     stl_le_p(&n->bar.intms, 0);
7375     stl_le_p(&n->bar.intmc, 0);
7376     stl_le_p(&n->bar.cc, 0);
7377 
7378     n->dbbuf_dbs = 0;
7379     n->dbbuf_eis = 0;
7380     n->dbbuf_enabled = false;
7381 }
7382 
7383 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7384 {
7385     NvmeNamespace *ns;
7386     int i;
7387 
7388     if (n->pmr.dev) {
7389         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7390     }
7391 
7392     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7393         ns = nvme_ns(n, i);
7394         if (!ns) {
7395             continue;
7396         }
7397 
7398         nvme_ns_shutdown(ns);
7399     }
7400 }
7401 
7402 static void nvme_select_iocs(NvmeCtrl *n)
7403 {
7404     NvmeNamespace *ns;
7405     int i;
7406 
7407     for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7408         ns = nvme_ns(n, i);
7409         if (!ns) {
7410             continue;
7411         }
7412 
7413         nvme_select_iocs_ns(n, ns);
7414     }
7415 }
7416 
7417 static int nvme_start_ctrl(NvmeCtrl *n)
7418 {
7419     uint64_t cap = ldq_le_p(&n->bar.cap);
7420     uint32_t cc = ldl_le_p(&n->bar.cc);
7421     uint32_t aqa = ldl_le_p(&n->bar.aqa);
7422     uint64_t asq = ldq_le_p(&n->bar.asq);
7423     uint64_t acq = ldq_le_p(&n->bar.acq);
7424     uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7425     uint32_t page_size = 1 << page_bits;
7426     NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7427 
7428     if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7429         trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7430                                                 le16_to_cpu(sctrl->nvq));
7431         return -1;
7432     }
7433     if (unlikely(n->cq[0])) {
7434         trace_pci_nvme_err_startfail_cq();
7435         return -1;
7436     }
7437     if (unlikely(n->sq[0])) {
7438         trace_pci_nvme_err_startfail_sq();
7439         return -1;
7440     }
7441     if (unlikely(asq & (page_size - 1))) {
7442         trace_pci_nvme_err_startfail_asq_misaligned(asq);
7443         return -1;
7444     }
7445     if (unlikely(acq & (page_size - 1))) {
7446         trace_pci_nvme_err_startfail_acq_misaligned(acq);
7447         return -1;
7448     }
7449     if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7450         trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7451         return -1;
7452     }
7453     if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7454         trace_pci_nvme_err_startfail_page_too_small(
7455                     NVME_CC_MPS(cc),
7456                     NVME_CAP_MPSMIN(cap));
7457         return -1;
7458     }
7459     if (unlikely(NVME_CC_MPS(cc) >
7460                  NVME_CAP_MPSMAX(cap))) {
7461         trace_pci_nvme_err_startfail_page_too_large(
7462                     NVME_CC_MPS(cc),
7463                     NVME_CAP_MPSMAX(cap));
7464         return -1;
7465     }
7466     if (unlikely(!NVME_AQA_ASQS(aqa))) {
7467         trace_pci_nvme_err_startfail_asqent_sz_zero();
7468         return -1;
7469     }
7470     if (unlikely(!NVME_AQA_ACQS(aqa))) {
7471         trace_pci_nvme_err_startfail_acqent_sz_zero();
7472         return -1;
7473     }
7474 
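         /*
          * CC.MPS encodes the host memory page size as 2^(12 + MPS); the
          * number of PRP entries per page follows from the 8-byte PRP entry
          * size.
          */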
7475     n->page_bits = page_bits;
7476     n->page_size = page_size;
7477     n->max_prp_ents = n->page_size / sizeof(uint64_t);
7478     nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7479     nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7480 
7481     nvme_set_timestamp(n, 0ULL);
7482 
7483     nvme_select_iocs(n);
7484 
7485     return 0;
7486 }
7487 
7488 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7489 {
7490     uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7491     uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7492 
7493     NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7494     NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7495     NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7496     stl_le_p(&n->bar.cmbloc, cmbloc);
7497 
7498     NVME_CMBSZ_SET_SQS(cmbsz, 1);
7499     NVME_CMBSZ_SET_CQS(cmbsz, 0);
7500     NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7501     NVME_CMBSZ_SET_RDS(cmbsz, 1);
7502     NVME_CMBSZ_SET_WDS(cmbsz, 1);
7503     NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7504     NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7505     stl_le_p(&n->bar.cmbsz, cmbsz);
7506 }
7507 
7508 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7509                            unsigned size)
7510 {
7511     PCIDevice *pci = PCI_DEVICE(n);
7512     uint64_t cap = ldq_le_p(&n->bar.cap);
7513     uint32_t cc = ldl_le_p(&n->bar.cc);
7514     uint32_t intms = ldl_le_p(&n->bar.intms);
7515     uint32_t csts = ldl_le_p(&n->bar.csts);
7516     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7517 
7518     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7519         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7520                        "MMIO write not 32-bit aligned,"
7521                        " offset=0x%"PRIx64"", offset);
7522         /* should be ignored, fall through for now */
7523     }
7524 
7525     if (unlikely(size < sizeof(uint32_t))) {
7526         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7527                        "MMIO write smaller than 32-bits,"
7528                        " offset=0x%"PRIx64", size=%u",
7529                        offset, size);
7530         /* should be ignored, fall through for now */
7531     }
7532 
7533     switch (offset) {
7534     case NVME_REG_INTMS:
7535         if (unlikely(msix_enabled(pci))) {
7536             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7537                            "undefined access to interrupt mask set"
7538                            " when MSI-X is enabled");
7539             /* should be ignored, fall through for now */
7540         }
7541         intms |= data;
7542         stl_le_p(&n->bar.intms, intms);
7543         n->bar.intmc = n->bar.intms;
7544         trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7545         nvme_irq_check(n);
7546         break;
7547     case NVME_REG_INTMC:
7548         if (unlikely(msix_enabled(pci))) {
7549             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7550                            "undefined access to interrupt mask clr"
7551                            " when MSI-X is enabled");
7552             /* should be ignored, fall through for now */
7553         }
7554         intms &= ~data;
7555         stl_le_p(&n->bar.intms, intms);
7556         n->bar.intmc = n->bar.intms;
7557         trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7558         nvme_irq_check(n);
7559         break;
7560     case NVME_REG_CC:
7561         stl_le_p(&n->bar.cc, data);
7562 
7563         trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7564 
7565         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7566             trace_pci_nvme_mmio_shutdown_set();
7567             nvme_ctrl_shutdown(n);
7568             csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7569             csts |= NVME_CSTS_SHST_COMPLETE;
7570         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7571             trace_pci_nvme_mmio_shutdown_cleared();
7572             csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7573         }
7574 
7575         if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7576             if (unlikely(nvme_start_ctrl(n))) {
7577                 trace_pci_nvme_err_startfail();
7578                 csts = NVME_CSTS_FAILED;
7579             } else {
7580                 trace_pci_nvme_mmio_start_success();
7581                 csts = NVME_CSTS_READY;
7582             }
7583         } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7584             trace_pci_nvme_mmio_stopped();
7585             nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7586 
7587             break;
7588         }
7589 
7590         stl_le_p(&n->bar.csts, csts);
7591 
7592         break;
7593     case NVME_REG_CSTS:
7594         if (data & (1 << 4)) {
7595             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7596                            "attempted to W1C CSTS.NSSRO"
7597                            " but CAP.NSSRS is zero (not supported)");
7598         } else if (data != 0) {
7599             NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7600                            "attempted to set a read only bit"
7601                            " of controller status");
7602         }
7603         break;
7604     case NVME_REG_NSSR:
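             /* 0x4e564d65 is the ASCII string "NVMe" required by NSSR */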
7605         if (data == 0x4e564d65) {
7606             trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7607         } else {
7608             /* The spec says that writes of other values have no effect */
7609             return;
7610         }
7611         break;
7612     case NVME_REG_AQA:
7613         stl_le_p(&n->bar.aqa, data);
7614         trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7615         break;
7616     case NVME_REG_ASQ:
7617         stn_le_p(&n->bar.asq, size, data);
7618         trace_pci_nvme_mmio_asqaddr(data);
7619         break;
7620     case NVME_REG_ASQ + 4:
7621         stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7622         trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7623         break;
7624     case NVME_REG_ACQ:
7625         trace_pci_nvme_mmio_acqaddr(data);
7626         stn_le_p(&n->bar.acq, size, data);
7627         break;
7628     case NVME_REG_ACQ + 4:
7629         stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7630         trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7631         break;
7632     case NVME_REG_CMBLOC:
7633         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7634                        "invalid write to reserved CMBLOC"
7635                        " when CMBSZ is zero, ignored");
7636         return;
7637     case NVME_REG_CMBSZ:
7638         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7639                        "invalid write to read only CMBSZ, ignored");
7640         return;
7641     case NVME_REG_CMBMSC:
7642         if (!NVME_CAP_CMBS(cap)) {
7643             return;
7644         }
7645 
7646         stn_le_p(&n->bar.cmbmsc, size, data);
7647         n->cmb.cmse = false;
7648 
7649         if (NVME_CMBMSC_CRE(data)) {
7650             nvme_cmb_enable_regs(n);
7651 
7652             if (NVME_CMBMSC_CMSE(data)) {
7653                 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7654                 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7655                 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7656                     uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7657                     NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7658                     stl_le_p(&n->bar.cmbsts, cmbsts);
7659                     return;
7660                 }
7661 
7662                 n->cmb.cba = cba;
7663                 n->cmb.cmse = true;
7664             }
7665         } else {
7666             n->bar.cmbsz = 0;
7667             n->bar.cmbloc = 0;
7668         }
7669 
7670         return;
7671     case NVME_REG_CMBMSC + 4:
7672         stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7673         return;
7674 
7675     case NVME_REG_PMRCAP:
7676         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7677                        "invalid write to PMRCAP register, ignored");
7678         return;
7679     case NVME_REG_PMRCTL:
7680         if (!NVME_CAP_PMRS(cap)) {
7681             return;
7682         }
7683 
7684         stl_le_p(&n->bar.pmrctl, data);
7685         if (NVME_PMRCTL_EN(data)) {
7686             memory_region_set_enabled(&n->pmr.dev->mr, true);
7687             pmrsts = 0;
7688         } else {
7689             memory_region_set_enabled(&n->pmr.dev->mr, false);
7690             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7691             n->pmr.cmse = false;
7692         }
7693         stl_le_p(&n->bar.pmrsts, pmrsts);
7694         return;
7695     case NVME_REG_PMRSTS:
7696         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7697                        "invalid write to PMRSTS register, ignored");
7698         return;
7699     case NVME_REG_PMREBS:
7700         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7701                        "invalid write to PMREBS register, ignored");
7702         return;
7703     case NVME_REG_PMRSWTP:
7704         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7705                        "invalid write to PMRSWTP register, ignored");
7706         return;
7707     case NVME_REG_PMRMSCL:
7708         if (!NVME_CAP_PMRS(cap)) {
7709             return;
7710         }
7711 
7712         stl_le_p(&n->bar.pmrmscl, data);
7713         n->pmr.cmse = false;
7714 
7715         if (NVME_PMRMSCL_CMSE(data)) {
7716             uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7717             hwaddr cba = pmrmscu << 32 |
7718                 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7719             if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7720                 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7721                 stl_le_p(&n->bar.pmrsts, pmrsts);
7722                 return;
7723             }
7724 
7725             n->pmr.cmse = true;
7726             n->pmr.cba = cba;
7727         }
7728 
7729         return;
7730     case NVME_REG_PMRMSCU:
7731         if (!NVME_CAP_PMRS(cap)) {
7732             return;
7733         }
7734 
7735         stl_le_p(&n->bar.pmrmscu, data);
7736         return;
7737     default:
7738         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7739                        "invalid MMIO write,"
7740                        " offset=0x%"PRIx64", data=%"PRIx64"",
7741                        offset, data);
7742         break;
7743     }
7744 }
7745 
7746 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7747 {
7748     NvmeCtrl *n = (NvmeCtrl *)opaque;
7749     uint8_t *ptr = (uint8_t *)&n->bar;
7750 
7751     trace_pci_nvme_mmio_read(addr, size);
7752 
7753     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7754         NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7755                        "MMIO read not 32-bit aligned,"
7756                        " offset=0x%"PRIx64"", addr);
7757         /* should RAZ, fall through for now */
7758     } else if (unlikely(size < sizeof(uint32_t))) {
7759         NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7760                        "MMIO read smaller than 32-bits,"
7761                        " offset=0x%"PRIx64"", addr);
7762         /* should RAZ, fall through for now */
7763     }
7764 
7765     if (addr > sizeof(n->bar) - size) {
7766         NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7767                        "MMIO read beyond last register,"
7768                        " offset=0x%"PRIx64", returning 0", addr);
7769 
7770         return 0;
7771     }
7772 
7773     if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7774         addr != NVME_REG_CSTS) {
7775         trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7776         return 0;
7777     }
7778 
7779     /*
7780      * When PMRWBM bit 1 is set, a read from
7781      * PMRSTS should ensure that prior writes
7782      * made it to persistent media.
7783      */
7784     if (addr == NVME_REG_PMRSTS &&
7785         (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7786         memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7787     }
7788 
7789     return ldn_le_p(ptr + addr, size);
7790 }
7791 
7792 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7793 {
7794     PCIDevice *pci = PCI_DEVICE(n);
7795     uint32_t qid;
7796 
7797     if (unlikely(addr & ((1 << 2) - 1))) {
7798         NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7799                        "doorbell write not 32-bit aligned,"
7800                        " offset=0x%"PRIx64", ignoring", addr);
7801         return;
7802     }
7803 
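         /*
          * Doorbells start at offset 0x1000. With CAP.DSTRD = 0 each doorbell
          * is 4 bytes wide and SQ tail / CQ head doorbells alternate, so bit 2
          * of the offset distinguishes completion from submission queue
          * doorbells.
          */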
7804     if (((addr - 0x1000) >> 2) & 1) {
7805         /* Completion queue doorbell write */
7806 
7807         uint16_t new_head = val & 0xffff;
7808         int start_sqs;
7809         NvmeCQueue *cq;
7810 
7811         qid = (addr - (0x1000 + (1 << 2))) >> 3;
7812         if (unlikely(nvme_check_cqid(n, qid))) {
7813             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
7814                            "completion queue doorbell write"
7815                            " for nonexistent queue,"
7816                            " cqid=%"PRIu32", ignoring", qid);
7817 
7818             /*
7819              * NVM Express v1.3d, Section 4.1 states: "If host software writes
7820              * an invalid value to the Submission Queue Tail Doorbell or
7821              * Completion Queue Head Doorbell register and an Asynchronous Event
7822              * Request command is outstanding, then an asynchronous event is
7823              * posted to the Admin Completion Queue with a status code of
7824              * Invalid Doorbell Write Value."
7825              *
7826              * Also note that the spec includes the "Invalid Doorbell Register"
7827              * status code, but nowhere does it specify when to use it.
7828              * However, it seems reasonable to use it here in a similar
7829              * fashion.
7830              */
7831             if (n->outstanding_aers) {
7832                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7833                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7834                                    NVME_LOG_ERROR_INFO);
7835             }
7836 
7837             return;
7838         }
7839 
7840         cq = n->cq[qid];
7841         if (unlikely(new_head >= cq->size)) {
7842             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
7843                            "completion queue doorbell write value"
7844                            " beyond queue size, cqid=%"PRIu32","
7845                            " new_head=%"PRIu16", ignoring",
7846                            qid, new_head);
7847 
7848             if (n->outstanding_aers) {
7849                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7850                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7851                                    NVME_LOG_ERROR_INFO);
7852             }
7853 
7854             return;
7855         }
7856 
7857         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
7858 
7859         start_sqs = nvme_cq_full(cq) ? 1 : 0;
7860         cq->head = new_head;
7861         if (!qid && n->dbbuf_enabled) {
7862             stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7863         }
7864         if (start_sqs) {
7865             NvmeSQueue *sq;
7866             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
7867                 qemu_bh_schedule(sq->bh);
7868             }
7869             qemu_bh_schedule(cq->bh);
7870         }
7871 
7872         if (cq->tail == cq->head) {
7873             if (cq->irq_enabled) {
7874                 n->cq_pending--;
7875             }
7876 
7877             nvme_irq_deassert(n, cq);
7878         }
7879     } else {
7880         /* Submission queue doorbell write */
7881 
7882         uint16_t new_tail = val & 0xffff;
7883         NvmeSQueue *sq;
7884 
7885         qid = (addr - 0x1000) >> 3;
7886         if (unlikely(nvme_check_sqid(n, qid))) {
7887             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
7888                            "submission queue doorbell write"
7889                            " for nonexistent queue,"
7890                            " sqid=%"PRIu32", ignoring", qid);
7891 
7892             if (n->outstanding_aers) {
7893                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7894                                    NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7895                                    NVME_LOG_ERROR_INFO);
7896             }
7897 
7898             return;
7899         }
7900 
7901         sq = n->sq[qid];
7902         if (unlikely(new_tail >= sq->size)) {
7903             NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
7904                            "submission queue doorbell write value"
7905                            " beyond queue size, sqid=%"PRIu32","
7906                            " new_tail=%"PRIu16", ignoring",
7907                            qid, new_tail);
7908 
7909             if (n->outstanding_aers) {
7910                 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7911                                    NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7912                                    NVME_LOG_ERROR_INFO);
7913             }
7914 
7915             return;
7916         }
7917 
7918         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
7919 
7920         sq->tail = new_tail;
7921         if (!qid && n->dbbuf_enabled) {
7922             /*
7923              * The spec states "the host shall also update the controller's
7924              * corresponding doorbell property to match the value of that entry
7925              * in the Shadow Doorbell buffer."
7926              *
7927              * Since this context is currently a VM trap, we can safely enforce
7928              * the requirement from the device side in case the host is
7929              * misbehaving.
7930              *
7931              * Note, we shouldn't have to do this, but various drivers,
7932              * including ones that run on Linux, are not updating Admin Queues,
7933              * so we can't trust reading it for an appropriate sq tail.
7934              */
7935             stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7936         }
7937 
7938         qemu_bh_schedule(sq->bh);
7939     }
7940 }
7941 
7942 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
7943                             unsigned size)
7944 {
7945     NvmeCtrl *n = (NvmeCtrl *)opaque;
7946 
7947     trace_pci_nvme_mmio_write(addr, data, size);
7948 
7949     if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7950         addr != NVME_REG_CSTS) {
7951         trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7952         return;
7953     }
7954 
7955     if (addr < sizeof(n->bar)) {
7956         nvme_write_bar(n, addr, data, size);
7957     } else {
7958         nvme_process_db(n, addr, data);
7959     }
7960 }
7961 
7962 static const MemoryRegionOps nvme_mmio_ops = {
7963     .read = nvme_mmio_read,
7964     .write = nvme_mmio_write,
7965     .endianness = DEVICE_LITTLE_ENDIAN,
7966     .impl = {
7967         .min_access_size = 2,
7968         .max_access_size = 8,
7969     },
7970 };
7971 
7972 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
7973                            unsigned size)
7974 {
7975     NvmeCtrl *n = (NvmeCtrl *)opaque;
7976     stn_le_p(&n->cmb.buf[addr], size, data);
7977 }
7978 
7979 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
7980 {
7981     NvmeCtrl *n = (NvmeCtrl *)opaque;
7982     return ldn_le_p(&n->cmb.buf[addr], size);
7983 }
7984 
7985 static const MemoryRegionOps nvme_cmb_ops = {
7986     .read = nvme_cmb_read,
7987     .write = nvme_cmb_write,
7988     .endianness = DEVICE_LITTLE_ENDIAN,
7989     .impl = {
7990         .min_access_size = 1,
7991         .max_access_size = 8,
7992     },
7993 };
7994 
7995 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
7996 {
7997     NvmeParams *params = &n->params;
7998 
7999     if (params->num_queues) {
8000         warn_report("num_queues is deprecated; please use max_ioqpairs "
8001                     "instead");
8002 
8003         params->max_ioqpairs = params->num_queues - 1;
8004     }
8005 
8006     if (n->namespace.blkconf.blk && n->subsys) {
8007         error_setg(errp, "subsystem support is unavailable with legacy "
8008                    "namespace ('drive' property)");
8009         return false;
8010     }
8011 
8012     if (params->max_ioqpairs < 1 ||
8013         params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8014         error_setg(errp, "max_ioqpairs must be between 1 and %d",
8015                    NVME_MAX_IOQPAIRS);
8016         return false;
8017     }
8018 
8019     if (params->msix_qsize < 1 ||
8020         params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8021         error_setg(errp, "msix_qsize must be between 1 and %d",
8022                    PCI_MSIX_FLAGS_QSIZE + 1);
8023         return false;
8024     }
8025 
8026     if (!params->serial) {
8027         error_setg(errp, "serial property not set");
8028         return false;
8029     }
8030 
8031     if (params->mqes < 1) {
8032         error_setg(errp, "mqes property cannot be less than 1");
8033         return false;
8034     }
8035 
8036     if (n->pmr.dev) {
8037         if (params->msix_exclusive_bar) {
8038             error_setg(errp, "not enough BARs available to enable PMR");
8039             return false;
8040         }
8041 
8042         if (host_memory_backend_is_mapped(n->pmr.dev)) {
8043             error_setg(errp, "can't use already busy memdev: %s",
8044                        object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8045             return false;
8046         }
8047 
8048         if (!is_power_of_2(n->pmr.dev->size)) {
8049             error_setg(errp, "pmr backend size needs to be a power of 2");
8050             return false;
8051         }
8052 
8053         host_memory_backend_set_mapped(n->pmr.dev, true);
8054     }
8055 
8056     if (n->params.zasl > n->params.mdts) {
8057         error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8058                    "than or equal to mdts (Maximum Data Transfer Size)");
8059         return false;
8060     }
8061 
8062     if (!n->params.vsl) {
8063         error_setg(errp, "vsl must be non-zero");
8064         return false;
8065     }
8066 
8067     if (params->sriov_max_vfs) {
8068         if (!n->subsys) {
8069             error_setg(errp, "subsystem is required for the use of SR-IOV");
8070             return false;
8071         }
8072 
8073         if (params->cmb_size_mb) {
8074             error_setg(errp, "CMB is not supported with SR-IOV");
8075             return false;
8076         }
8077 
8078         if (n->pmr.dev) {
8079             error_setg(errp, "PMR is not supported with SR-IOV");
8080             return false;
8081         }
8082 
8083         if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8084             error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8085                        " must be set for the use of SR-IOV");
8086             return false;
8087         }
8088 
8089         if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8090             error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8091                        " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8092             return false;
8093         }
8094 
8095         if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8096             error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8097                        " greater than or equal to 2");
8098             return false;
8099         }
8100 
8101         if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8102             error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8103                        " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8104             return false;
8105         }
8106 
8107         if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8108             error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8109                        " greater than or equal to 1");
8110             return false;
8111         }
8112 
8113         if (params->sriov_max_vi_per_vf &&
8114             (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8115             error_setg(errp, "sriov_max_vi_per_vf must meet:"
8116                        " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8117                        " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8118             return false;
8119         }
8120 
8121         if (params->sriov_max_vq_per_vf &&
8122             (params->sriov_max_vq_per_vf < 2 ||
8123              (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8124             error_setg(errp, "sriov_max_vq_per_vf must meet:"
8125                        " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8126                        " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8127             return false;
8128         }
8129     }
8130 
8131     return true;
8132 }
8133 
8134 static void nvme_init_state(NvmeCtrl *n)
8135 {
8136     NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8137     NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8138     NvmeSecCtrlEntry *sctrl;
8139     PCIDevice *pci = PCI_DEVICE(n);
8140     uint8_t max_vfs;
8141     int i;
8142 
8143     if (pci_is_vf(pci)) {
8144         sctrl = nvme_sctrl(n);
8145         max_vfs = 0;
8146         n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8147         n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8148     } else {
8149         max_vfs = n->params.sriov_max_vfs;
8150         n->conf_ioqpairs = n->params.max_ioqpairs;
8151         n->conf_msix_qsize = n->params.msix_qsize;
8152     }
8153 
8154     n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8155     n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8156     n->temperature = NVME_TEMPERATURE;
8157     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8158     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8159     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8160     QTAILQ_INIT(&n->aer_queue);
8161 
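         /*
          * Pre-populate the secondary controller list: entry i describes
          * VF i + 1 and is parented to this (primary) controller.
          */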
8162     n->nr_sec_ctrls = max_vfs;
8163     for (i = 0; i < max_vfs; i++) {
8164         sctrl = &list[i];
8165         sctrl->pcid = cpu_to_le16(n->cntlid);
8166         sctrl->vfn = cpu_to_le16(i + 1);
8167     }
8168 
8169     cap->cntlid = cpu_to_le16(n->cntlid);
8170     cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8171 
8172     if (pci_is_vf(pci)) {
8173         cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8174     } else {
8175         cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8176                                  n->params.sriov_vq_flexible);
8177         cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8178         cap->vqrfap = cap->vqfrt;
8179         cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8180         cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8181                         cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8182                         cap->vqfrt / MAX(max_vfs, 1);
8183     }
8184 
8185     if (pci_is_vf(pci)) {
8186         cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8187     } else {
8188         cap->viprt = cpu_to_le16(n->params.msix_qsize -
8189                                  n->params.sriov_vi_flexible);
8190         cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8191         cap->virfap = cap->vifrt;
8192         cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8193         cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8194                         cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8195                         cap->vifrt / MAX(max_vfs, 1);
8196     }
8197 }
8198 
8199 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8200 {
8201     uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8202     uint64_t cap = ldq_le_p(&n->bar.cap);
8203 
8204     n->cmb.buf = g_malloc0(cmb_size);
8205     memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8206                           "nvme-cmb", cmb_size);
8207     pci_register_bar(pci_dev, NVME_CMB_BIR,
8208                      PCI_BASE_ADDRESS_SPACE_MEMORY |
8209                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
8210                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8211 
8212     NVME_CAP_SET_CMBS(cap, 1);
8213     stq_le_p(&n->bar.cap, cap);
8214 
8215     if (n->params.legacy_cmb) {
8216         nvme_cmb_enable_regs(n);
8217         n->cmb.cmse = true;
8218     }
8219 }
8220 
8221 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8222 {
8223     uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8224 
8225     NVME_PMRCAP_SET_RDS(pmrcap, 1);
8226     NVME_PMRCAP_SET_WDS(pmrcap, 1);
8227     NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8228     /* Turn on PMRWBM bit 1 support (a read of PMRSTS flushes prior writes) */
8229     NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8230     NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8231     stl_le_p(&n->bar.pmrcap, pmrcap);
8232 
8233     pci_register_bar(pci_dev, NVME_PMR_BIR,
8234                      PCI_BASE_ADDRESS_SPACE_MEMORY |
8235                      PCI_BASE_ADDRESS_MEM_TYPE_64 |
8236                      PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8237 
8238     memory_region_set_enabled(&n->pmr.dev->mr, false);
8239 }
8240 
8241 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8242                                unsigned *msix_table_offset,
8243                                unsigned *msix_pba_offset)
8244 {
8245     uint64_t bar_size, msix_table_size;
8246 
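         /*
          * BAR0 layout: controller registers and doorbells first, then (if
          * MSI-X is used) a 4 KiB aligned MSI-X table followed by the PBA; the
          * final size is rounded up to a power of two.
          */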
8247     bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8248 
8249     if (total_irqs == 0) {
8250         goto out;
8251     }
8252 
8253     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8254 
8255     if (msix_table_offset) {
8256         *msix_table_offset = bar_size;
8257     }
8258 
8259     msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8260     bar_size += msix_table_size;
8261     bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8262 
8263     if (msix_pba_offset) {
8264         *msix_pba_offset = bar_size;
8265     }
8266 
8267     bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8268 
8269 out:
8270     return pow2ceil(bar_size);
8271 }
8272 
8273 static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
8274                             Error **errp)
8275 {
8276     uint16_t vf_dev_id = n->params.use_intel_id ?
8277                          PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8278     NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8279     uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8280                                       le16_to_cpu(cap->vifrsm),
8281                                       NULL, NULL);
8282 
8283     if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8284                             n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8285                             NVME_VF_OFFSET, NVME_VF_STRIDE,
8286                             errp)) {
8287         return false;
8288     }
8289 
8290     pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8291                               PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8292 
8293     return true;
8294 }
8295 
8296 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8297 {
8298     Error *err = NULL;
8299     int ret;
8300 
8301     ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8302                              PCI_PM_SIZEOF, &err);
8303     if (err) {
8304         error_report_err(err);
8305         return ret;
8306     }
8307 
8308     pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8309                  PCI_PM_CAP_VER_1_2);
8310     pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8311                  PCI_PM_CTRL_NO_SOFT_RESET);
8312     pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8313                  PCI_PM_CTRL_STATE_MASK);
8314 
8315     return 0;
8316 }
8317 
8318 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8319 {
8320     ERRP_GUARD();
8321     uint8_t *pci_conf = pci_dev->config;
8322     uint64_t bar_size;
8323     unsigned msix_table_offset = 0, msix_pba_offset = 0;
8324     unsigned nr_vectors;
8325     int ret;
8326 
8327     pci_conf[PCI_INTERRUPT_PIN] = 1;
8328     pci_config_set_prog_interface(pci_conf, 0x2);
8329 
8330     if (n->params.use_intel_id) {
8331         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8332         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8333     } else {
8334         pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8335         pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8336     }
8337 
8338     pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8339     nvme_add_pm_capability(pci_dev, 0x60);
8340     pcie_endpoint_cap_init(pci_dev, 0x80);
8341     pcie_cap_flr_init(pci_dev);
8342     if (n->params.sriov_max_vfs) {
8343         pcie_ari_init(pci_dev, 0x100);
8344     }
8345 
8346     if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8347         bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8348         memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8349                               bar_size);
8350         pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8351                          PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8352         ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8353     } else {
8354         assert(n->params.msix_qsize >= 1);
8355 
8356         /* add one to max_ioqpairs to account for the admin queue pair */
8357         if (!pci_is_vf(pci_dev)) {
8358             nr_vectors = n->params.msix_qsize;
8359             bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8360                                       nr_vectors, &msix_table_offset,
8361                                       &msix_pba_offset);
8362         } else {
8363             NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8364             NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8365 
8366             nr_vectors = le16_to_cpu(cap->vifrsm);
8367             bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8368                                       &msix_table_offset, &msix_pba_offset);
8369         }
8370 
8371         memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8372         memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8373                               msix_table_offset);
8374         memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8375 
8376         if (pci_is_vf(pci_dev)) {
8377             pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8378         } else {
8379             pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8380                              PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8381         }
8382 
8383         ret = msix_init(pci_dev, nr_vectors,
8384                         &n->bar0, 0, msix_table_offset,
8385                         &n->bar0, 0, msix_pba_offset, 0, errp);
8386     }
8387 
8388     if (ret == -ENOTSUP) {
8389         /* report that msix is not supported, but do not error out */
8390         warn_report_err(*errp);
8391         *errp = NULL;
8392     } else if (ret < 0) {
8393         /* propagate error to caller */
8394         return false;
8395     }
8396 
8397     if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
8398         !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
8399         msix_uninit(pci_dev, &n->bar0, &n->bar0);
8400         return false;
8401     }
8402 
8403     nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8404 
8405     if (n->params.cmb_size_mb) {
8406         nvme_init_cmb(n, pci_dev);
8407     }
8408 
8409     if (n->pmr.dev) {
8410         nvme_init_pmr(n, pci_dev);
8411     }
8412 
8413     return true;
8414 }
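
/*
 * Rough shape of the combined BAR0 set up above when msix-exclusive-bar is
 * off (the actual offsets are whatever nvme_mbar_size() computed from the
 * queue pair and vector counts; this is only an illustration):
 *
 *   +------------------------------+-------------+-----------+
 *   | NVMe registers and doorbells | MSI-X table | MSI-X PBA |
 *   +------------------------------+-------------+-----------+
 *   0                       msix_table_offset    msix_pba_offset
 *
 * With msix-exclusive-bar=on (physical function only), BAR0 holds just the
 * register and doorbell region and the MSI-X structures get their own
 * exclusive BAR (BAR 4, per the msix_init_exclusive_bar() call above).
 */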
8415 
8416 static void nvme_init_subnqn(NvmeCtrl *n)
8417 {
8418     NvmeSubsystem *subsys = n->subsys;
8419     NvmeIdCtrl *id = &n->id_ctrl;
8420 
8421     if (!subsys) {
8422         snprintf((char *)id->subnqn, sizeof(id->subnqn),
8423                  "nqn.2019-08.org.qemu:%s", n->params.serial);
8424     } else {
8425         pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char *)subsys->subnqn);
8426     }
8427 }
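
/*
 * For example, with an illustrative serial of "deadbeef" and no subsystem,
 * the controller reports SUBNQN "nqn.2019-08.org.qemu:deadbeef"; when a
 * subsystem is attached, its SUBNQN is copied verbatim instead.
 */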
8428 
8429 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8430 {
8431     NvmeIdCtrl *id = &n->id_ctrl;
8432     uint8_t *pci_conf = pci_dev->config;
8433     uint64_t cap = ldq_le_p(&n->bar.cap);
8434     NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8435     uint32_t ctratt;
8436 
8437     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8438     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8439     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8440     strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8441     strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8442 
8443     id->cntlid = cpu_to_le16(n->cntlid);
8444 
8445     id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8446     ctratt = NVME_CTRATT_ELBAS;
8447 
8448     id->rab = 6;
8449 
8450     if (n->params.use_intel_id) {
8451         id->ieee[0] = 0xb3;
8452         id->ieee[1] = 0x02;
8453         id->ieee[2] = 0x00;
8454     } else {
8455         id->ieee[0] = 0x00;
8456         id->ieee[1] = 0x54;
8457         id->ieee[2] = 0x52;
8458     }
8459 
8460     id->mdts = n->params.mdts;
8461     id->ver = cpu_to_le32(NVME_SPEC_VER);
8462     id->oacs =
8463         cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8464                     NVME_OACS_DIRECTIVES);
8465     id->cntrltype = 0x1;
8466 
8467     /*
8468      * Because the controller always completes the Abort command immediately,
8469      * there can never be more than one concurrently executing Abort command,
8470      * so this value is never used for anything. Note that there can easily be
8471      * many Abort commands in the queues, but they are not considered
8472      * "executing" until processed by nvme_abort.
8473      *
8474      * The specification recommends a value of 3 for Abort Command Limit (four
8475      * concurrently outstanding Abort commands), so let's use that, though it
8476      * is inconsequential.
8477      */
8478     id->acl = 3;
8479     id->aerl = n->params.aerl;
8480     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8481     id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8482 
8483     /* recommended default value (~70 C) */
8484     id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8485     id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8486 
8487     id->sqes = (NVME_SQES << 4) | NVME_SQES;
8488     id->cqes = (NVME_CQES << 4) | NVME_CQES;
8489     id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8490     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8491                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8492                            NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8493                            NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8494 
8495     /*
8496      * NOTE: If this device ever supports a command set that does NOT use 0x0
8497      * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8498      * should probably be removed.
8499      *
8500      * See comment in nvme_io_cmd.
8501      */
8502     id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8503 
8504     id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8505                             NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8506     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
8507 
8508     nvme_init_subnqn(n);
8509 
8510     id->psd[0].mp = cpu_to_le16(0x9c4);
8511     id->psd[0].enlat = cpu_to_le32(0x10);
8512     id->psd[0].exlat = cpu_to_le32(0x4);
8513 
8514     if (n->subsys) {
8515         id->cmic |= NVME_CMIC_MULTI_CTRL;
8516         ctratt |= NVME_CTRATT_ENDGRPS;
8517 
8518         id->endgidmax = cpu_to_le16(0x1);
8519 
8520         if (n->subsys->endgrp.fdp.enabled) {
8521             ctratt |= NVME_CTRATT_FDPS;
8522         }
8523     }
8524 
8525     id->ctratt = cpu_to_le32(ctratt);
8526 
8527     NVME_CAP_SET_MQES(cap, n->params.mqes);
8528     NVME_CAP_SET_CQR(cap, 1);
8529     NVME_CAP_SET_TO(cap, 0xf);
8530     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8531     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8532     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8533     NVME_CAP_SET_MPSMAX(cap, 4);
8534     NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8535     NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8536     stq_le_p(&n->bar.cap, cap);
8537 
8538     stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8539     n->bar.intmc = n->bar.intms = 0;
8540 
8541     if (pci_is_vf(pci_dev) && !sctrl->scs) {
8542         stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8543     }
8544 }
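
/*
 * A rough reading of the CAP value assembled above, assuming the default
 * mqes=0x7ff: MQES is zeroes based, so queues may hold up to 2048 entries;
 * CQR=1 requires physically contiguous queues; TO=0xf advertises a
 * worst-case ready timeout of 7.5 s (500 ms units); MPSMAX=4 permits memory
 * page sizes up to 2^(12 + 4) = 64 KiB; CMBS and PMRS simply mirror whether
 * a CMB or PMR was configured.
 */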
8545 
8546 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8547 {
8548     int cntlid;
8549 
8550     if (!n->subsys) {
8551         return 0;
8552     }
8553 
8554     cntlid = nvme_subsys_register_ctrl(n, errp);
8555     if (cntlid < 0) {
8556         return -1;
8557     }
8558 
8559     n->cntlid = cntlid;
8560 
8561     return 0;
8562 }
8563 
8564 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8565 {
8566     uint32_t nsid = ns->params.nsid;
8567     assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8568 
8569     n->namespaces[nsid] = ns;
8570     ns->attached++;
8571 
8572     n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8573                             BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8574 }
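
/*
 * Worked example for the dmrsl update above (numbers approximate):
 * BDRV_REQUEST_MAX_BYTES is on the order of 2 GiB, so a namespace with
 * 512-byte logical blocks caps dmrsl at roughly 4M blocks while one with
 * 4 KiB blocks caps it at roughly 512K blocks; MIN_NON_ZERO() keeps the
 * smallest cap across all attached namespaces.
 */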
8575 
8576 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8577 {
8578     NvmeCtrl *n = NVME(pci_dev);
8579     DeviceState *dev = DEVICE(pci_dev);
8580     NvmeNamespace *ns;
8581     NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8582 
8583     if (pci_is_vf(pci_dev)) {
8584         /*
8585          * VFs derive their settings from the parent PF, whose lifespan
8586          * always exceeds that of its VFs.
8587          */
8588         memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8589 
8590         /*
8591          * Duplicate the PF's serial for the VF so that releasing the VF's
8592          * 'serial' property on VF removal does not free the PF's string.
8593          */
8594         n->params.serial = g_strdup(pn->params.serial);
8595         n->subsys = pn->subsys;
8596     }
8597 
8598     if (!nvme_check_params(n, errp)) {
8599         return;
8600     }
8601 
8602     qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8603 
8604     if (nvme_init_subsys(n, errp)) {
8605         return;
8606     }
8607     nvme_init_state(n);
8608     if (!nvme_init_pci(n, pci_dev, errp)) {
8609         return;
8610     }
8611     nvme_init_ctrl(n, pci_dev);
8612 
8613     /* set up a namespace if the controller's drive property was given */
8614     if (n->namespace.blkconf.blk) {
8615         ns = &n->namespace;
8616         ns->params.nsid = 1;
8617 
8618         if (nvme_ns_setup(ns, errp)) {
8619             return;
8620         }
8621 
8622         nvme_attach_ns(n, ns);
8623     }
8624 }
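
/*
 * A minimal invocation exercising the implicit-namespace path above (file
 * name and ids are illustrative):
 *
 *   -drive file=nvme.img,if=none,id=nvm
 *   -device nvme,serial=deadbeef,drive=nvm
 *
 * The block device passed via `drive=` is set up as namespace 1 of this
 * controller.
 */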
8625 
8626 static void nvme_exit(PCIDevice *pci_dev)
8627 {
8628     NvmeCtrl *n = NVME(pci_dev);
8629     NvmeNamespace *ns;
8630     int i;
8631 
8632     nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8633 
8634     if (n->subsys) {
8635         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8636             ns = nvme_ns(n, i);
8637             if (ns) {
8638                 ns->attached--;
8639             }
8640         }
8641 
8642         nvme_subsys_unregister_ctrl(n->subsys, n);
8643     }
8644 
8645     g_free(n->cq);
8646     g_free(n->sq);
8647     g_free(n->aer_reqs);
8648 
8649     if (n->params.cmb_size_mb) {
8650         g_free(n->cmb.buf);
8651     }
8652 
8653     if (n->pmr.dev) {
8654         host_memory_backend_set_mapped(n->pmr.dev, false);
8655     }
8656 
8657     if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8658         pcie_sriov_pf_exit(pci_dev);
8659     }
8660 
8661     msix_uninit(pci_dev, &n->bar0, &n->bar0);
8662     memory_region_del_subregion(&n->bar0, &n->iomem);
8663 }
8664 
8665 static Property nvme_props[] = {
8666     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8667     DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8668                      HostMemoryBackend *),
8669     DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8670                      NvmeSubsystem *),
8671     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8672     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8673     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8674     DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8675     DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8676     DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8677     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8678     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8679     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8680     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8681     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8682     DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8683     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8684     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8685                      params.auto_transition_zones, true),
8686     DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8687     DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8688                        params.sriov_vq_flexible, 0),
8689     DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8690                        params.sriov_vi_flexible, 0),
8691     DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
8692                        params.sriov_max_vi_per_vf, 0),
8693     DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
8694                        params.sriov_max_vq_per_vf, 0),
8695     DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
8696                      false),
8697     DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
8698     DEFINE_PROP_END_OF_LIST(),
8699 };
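
/*
 * Note on the defaults above: msix_qsize defaults to 65, matching the
 * default 64 I/O queue pairs plus the admin queue pair so that each can
 * have a dedicated MSI-X vector; e.g. max_ioqpairs=8,msix_qsize=9 keeps
 * that one-vector-per-queue-pair relationship at a smaller scale.
 */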
8700 
8701 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8702                                    void *opaque, Error **errp)
8703 {
8704     NvmeCtrl *n = NVME(obj);
8705     uint8_t value = n->smart_critical_warning;
8706 
8707     visit_type_uint8(v, name, &value, errp);
8708 }
8709 
8710 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8711                                    void *opaque, Error **errp)
8712 {
8713     NvmeCtrl *n = NVME(obj);
8714     uint8_t value, old_value, cap = 0, index, event;
8715 
8716     if (!visit_type_uint8(v, name, &value, errp)) {
8717         return;
8718     }
8719 
8720     cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8721           | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8722     if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8723         cap |= NVME_SMART_PMR_UNRELIABLE;
8724     }
8725 
8726     if ((value & cap) != value) {
8727         error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8728                    value & ~cap);
8729         return;
8730     }
8731 
8732     old_value = n->smart_critical_warning;
8733     n->smart_critical_warning = value;
8734 
8735     /* only inject new bits of smart critical warning */
8736     for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
8737         event = 1 << index;
8738         if (value & ~old_value & event)
8739             nvme_smart_event(n, event);
8740     }
8741 }
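
/*
 * This property can be set at run time, e.g. via QMP (the path assumes a
 * controller created with id=nvme0; value 16 corresponds to
 * NVME_SMART_FAILED_VOLATILE_MEDIA, i.e. bit 4):
 *
 *   { "execute": "qom-set",
 *     "arguments": { "path": "/machine/peripheral/nvme0",
 *                    "property": "smart_critical_warning",
 *                    "value": 16 } }
 *
 * Only bits that were previously clear trigger a SMART asynchronous event
 * through nvme_smart_event().
 */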
8742 
8743 static void nvme_pci_reset(DeviceState *qdev)
8744 {
8745     PCIDevice *pci_dev = PCI_DEVICE(qdev);
8746     NvmeCtrl *n = NVME(pci_dev);
8747 
8748     trace_pci_nvme_pci_reset();
8749     nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8750 }
8751 
8752 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
8753 {
8754     NvmeCtrl *n = NVME(dev);
8755     NvmeSecCtrlEntry *sctrl;
8756     int i;
8757 
8758     for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
8759         sctrl = &n->sec_ctrl_list[i];
8760         nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
8761     }
8762 }
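
/*
 * Illustration: if the guest shrinks NumVFs through the SR-IOV capability,
 * say from 4 to 2, the loop above walks the two now-removed VFs and marks
 * their secondary controllers offline via nvme_virt_set_state().
 */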
8763 
8764 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
8765                                   uint32_t val, int len)
8766 {
8767     uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
8768 
8769     pci_default_write_config(dev, address, val, len);
8770     pcie_cap_flr_write_config(dev, address, val, len);
8771     nvme_sriov_post_write_config(dev, old_num_vfs);
8772 }
8773 
8774 static const VMStateDescription nvme_vmstate = {
8775     .name = "nvme",
8776     .unmigratable = 1,
8777 };
8778 
8779 static void nvme_class_init(ObjectClass *oc, void *data)
8780 {
8781     DeviceClass *dc = DEVICE_CLASS(oc);
8782     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
8783 
8784     pc->realize = nvme_realize;
8785     pc->config_write = nvme_pci_write_config;
8786     pc->exit = nvme_exit;
8787     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
8788     pc->revision = 2;
8789 
8790     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
8791     dc->desc = "Non-Volatile Memory Express";
8792     device_class_set_props(dc, nvme_props);
8793     dc->vmsd = &nvme_vmstate;
8794     dc->reset = nvme_pci_reset;
8795 }
8796 
8797 static void nvme_instance_init(Object *obj)
8798 {
8799     NvmeCtrl *n = NVME(obj);
8800 
8801     device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
8802                                   "bootindex", "/namespace@1,0",
8803                                   DEVICE(obj));
8804 
8805     object_property_add(obj, "smart_critical_warning", "uint8",
8806                         nvme_get_smart_warning,
8807                         nvme_set_smart_warning, NULL, NULL);
8808 }
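
/*
 * The bootindex property registered above refers to the controller's
 * implicit namespace ("/namespace@1,0"), so that namespace can be made a
 * boot target with, for example (ids illustrative):
 *
 *   -device nvme,serial=deadbeef,drive=nvm,bootindex=1
 */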
8809 
8810 static const TypeInfo nvme_info = {
8811     .name          = TYPE_NVME,
8812     .parent        = TYPE_PCI_DEVICE,
8813     .instance_size = sizeof(NvmeCtrl),
8814     .instance_init = nvme_instance_init,
8815     .class_init    = nvme_class_init,
8816     .interfaces = (InterfaceInfo[]) {
8817         { INTERFACE_PCIE_DEVICE },
8818         { }
8819     },
8820 };
8821 
8822 static const TypeInfo nvme_bus_info = {
8823     .name = TYPE_NVME_BUS,
8824     .parent = TYPE_BUS,
8825     .instance_size = sizeof(NvmeBus),
8826 };
8827 
8828 static void nvme_register_types(void)
8829 {
8830     type_register_static(&nvme_info);
8831     type_register_static(&nvme_bus_info);
8832 }
8833 
8834 type_init(nvme_register_types)
8835