1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 *
16 *
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
20 * NVMe subsystem uses the format from the NVMe specifications in comments
21 * (i.e. an 'h' suffix instead of a '0x' prefix).
22 *
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
26 *
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]> \
39 * sriov_vq_flexible=<N[optional]> \
40 * sriov_vi_flexible=<N[optional]> \
41 * sriov_max_vi_per_vf=<N[optional]> \
42 * sriov_max_vq_per_vf=<N[optional]> \
43 * atomic.dn=<on|off[optional]>, \
44 * atomic.awun=<N[optional]>, \
45 * atomic.awupf=<N[optional]>, \
46 * subsys=<subsys_id>
47 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
48 * zoned=<true|false[optional]>, \
49 * subsys=<subsys_id>,shared=<true|false[optional]>, \
50 * detached=<true|false[optional]>, \
51 * zoned.zone_size=<N[optional]>, \
52 * zoned.zone_capacity=<N[optional]>, \
53 * zoned.descr_ext_size=<N[optional]>, \
54 * zoned.max_active=<N[optional]>, \
55 * zoned.max_open=<N[optional]>, \
56 * zoned.cross_read=<true|false[optional]>
57 *
58 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is
59 * assumed to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for
60 * now. By default, the device will use the "v1.4 CMB scheme"; use the
61 * `legacy-cmb` parameter to always enable the CMBLOC and CMBSZ registers
* (v1.3 behavior).
62 *
63 * PMR emulation can be enabled by pointing `pmrdev` at a memory-backend-file.
64 * For example:
65 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
66 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
67 *
68 * The PMR will use BAR 4/5 exclusively.
69 *
70 * To place controller(s) and namespace(s) in a subsystem, provide the
71 * nvme-subsys device as shown above.
72 *
73 * nvme subsystem device parameters
74 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75 * - `nqn`
76 * This parameter provides the `<nqn_id>` part of the string
77 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
78 * of subsystem controllers. Note that `<nqn_id>` should be unique per
79 * subsystem, but this is not enforced by QEMU. If not specified, it will
80 * default to the value of the `id` parameter (`<subsys_id>`).
81 *
82 * nvme device parameters
83 * ~~~~~~~~~~~~~~~~~~~~~~
84 * - `subsys`
85 * Specifying this parameter attaches the controller to the subsystem and
86 * the SUBNQN field in the controller will report the NQN of the subsystem
87 * device. This also enables the multi-controller capability, which is
88 * reported in the CMIC (Controller Multi-path I/O and Namespace Sharing
89 * Capabilities) field of the Identify Controller data structure.
90 *
91 * - `aerl`
92 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
93 * of concurrently outstanding Asynchronous Event Request commands supported
94 * by the controller. This is a 0's based value.
95 *
96 * - `aer_max_queued`
97 * This is the maximum number of events that the device will enqueue for
98 * completion when there are no outstanding AERs. When the maximum number of
99 * enqueued events is reached, subsequent events will be dropped.
100 *
101 * - `mdts`
102 * Indicates the maximum data transfer size for a command that transfers data
103 * between host-accessible memory and the controller. The value is specified
104 * as a power of two (2^n) and is in units of the minimum memory page size
105 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
106 *
107 * - `vsl`
108 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
109 * this value is specified as a power of two (2^n) and is in units of the
110 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
111 * KiB).
112 *
113 * - `zoned.zasl`
114 * Indicates the maximum data transfer size for the Zone Append command. Like
115 * `mdts`, the value is specified as a power of two (2^n) and is in units of
116 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
117 * defaulting to the value of `mdts`).
118 *
119 * - `zoned.auto_transition`
120 * Indicates whether zones in the Implicitly Opened state may automatically be
121 * transitioned to the Closed state for resource management purposes.
122 * Defaults to 'on'.
123 *
124 * - `sriov_max_vfs`
125 * Indicates the maximum number of PCIe virtual functions supported
126 * by the controller. The default value is 0. Specifying a non-zero value
127 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
128 * Virtual function controllers will not report SR-IOV capability.
129 *
130 * NOTE: Single Root I/O Virtualization support is experimental.
131 * All the related parameters may be subject to change.
132 *
133 * - `sriov_vq_flexible`
134 * Indicates the total number of flexible queue resources assignable to all
135 * the secondary controllers. Implicitly sets the number of primary
136 * controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
137 *
138 * - `sriov_vi_flexible`
139 * Indicates the total number of flexible interrupt resources assignable to
140 * all the secondary controllers. Implicitly sets the number of primary
141 * controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
142 *
143 * - `sriov_max_vi_per_vf`
144 * Indicates the maximum number of virtual interrupt resources assignable
145 * to a secondary controller. The default 0 resolves to
146 * `(sriov_vi_flexible / sriov_max_vfs)`.
147 *
148 * - `sriov_max_vq_per_vf`
149 * Indicates the maximum number of virtual queue resources assignable to
150 * a secondary controller. The default 0 resolves to
151 * `(sriov_vq_flexible / sriov_max_vfs)`.
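 *
 * As a worked example of how the SR-IOV parameters combine (the numbers are
 * illustrative, not defaults): with max_ioqpairs=26, msix_qsize=17,
 * sriov_max_vfs=4, sriov_vq_flexible=8 and sriov_vi_flexible=4, the primary
 * controller keeps 26 - 8 = 18 private queue pairs and 17 - 4 = 13 private
 * interrupt vectors, while the 8 flexible queue and 4 flexible interrupt
 * resources are shared among the 4 secondary controllers (at most
 * 8 / 4 = 2 and 4 / 4 = 1 per VF when sriov_max_vq_per_vf and
 * sriov_max_vi_per_vf are left at 0).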
152 *
153 * nvme namespace device parameters
154 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155 * - `shared`
156 * When the parent nvme device (as defined explicitly by the 'bus' parameter
157 * or implicitly by the most recently defined NvmeBus) is linked to an
158 * nvme-subsys device, the namespace will be attached to all controllers in
159 * the subsystem. If set to 'off' (the default), the namespace will remain a
160 * private namespace and may only be attached to a single controller at a
161 * time.
162 *
163 * - `detached`
164 * This parameter is only valid together with the `subsys` parameter. If left
165 * at the default value (`false/off`), the namespace will be attached to all
166 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
167 * namespace will be available in the subsystem but not attached to any
168 * controllers.
169 *
170 * Setting `zoned` to true selects the Zoned Namespace Command Set for the
 * namespace.
171 * In this case, the following namespace properties are available to configure
172 * zoned operation:
173 * zoned.zone_size=<zone size in bytes, default: 128MiB>
174 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
175 *
176 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
177 * The value 0 (default) forces zone capacity to be the same as zone
178 * size. The value of this property may not exceed zone size.
179 *
180 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
181 * This value needs to be specified in 64B units. If it is zero,
182 * namespace(s) will not support zone descriptor extensions.
183 *
184 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
185 * The default value means there is no limit to the number of
186 * concurrently active zones.
187 *
188 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
189 * The default value means there is no limit to the number of
190 * concurrently open zones.
191 *
192 * zoned.cross_read=<enable RAZB, default: false>
193 * Setting this property to true enables Read Across Zone Boundaries.
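 *
 * As an end-to-end illustration (IDs, file name and sizes below are
 * placeholders, not defaults), a zoned namespace in a shared subsystem could
 * be configured with:
 *
 * -drive file=zns.img,if=none,id=nvm0
 * -device nvme-subsys,id=subsys0,nqn=subsys0
 * -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 * -device nvme-ns,drive=nvm0,bus=nvme0,nsid=1,zoned=true, \
 *     zoned.zone_size=64M,zoned.zone_capacity=60M, \
 *     zoned.max_open=16,zoned.max_active=32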
194 */
195
196 #include "qemu/osdep.h"
197 #include "qemu/cutils.h"
198 #include "qemu/error-report.h"
199 #include "qemu/log.h"
200 #include "qemu/units.h"
201 #include "qemu/range.h"
202 #include "qapi/error.h"
203 #include "qapi/visitor.h"
204 #include "sysemu/sysemu.h"
205 #include "sysemu/block-backend.h"
206 #include "sysemu/hostmem.h"
207 #include "hw/pci/msix.h"
208 #include "hw/pci/pcie_sriov.h"
209 #include "sysemu/spdm-socket.h"
210 #include "migration/vmstate.h"
211
212 #include "nvme.h"
213 #include "dif.h"
214 #include "trace.h"
215
216 #define NVME_MAX_IOQPAIRS 0xffff
217 #define NVME_DB_SIZE 4
218 #define NVME_SPEC_VER 0x00010400
219 #define NVME_CMB_BIR 2
220 #define NVME_PMR_BIR 4
221 #define NVME_TEMPERATURE 0x143
222 #define NVME_TEMPERATURE_WARNING 0x157
223 #define NVME_TEMPERATURE_CRITICAL 0x175
224 #define NVME_NUM_FW_SLOTS 1
225 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
226 #define NVME_VF_RES_GRANULARITY 1
227 #define NVME_VF_OFFSET 0x1
228 #define NVME_VF_STRIDE 1
229
230 #define NVME_GUEST_ERR(trace, fmt, ...) \
231 do { \
232 (trace_##trace)(__VA_ARGS__); \
233 qemu_log_mask(LOG_GUEST_ERROR, #trace \
234 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
235 } while (0)
236
237 static const bool nvme_feature_support[NVME_FID_MAX] = {
238 [NVME_ARBITRATION] = true,
239 [NVME_POWER_MANAGEMENT] = true,
240 [NVME_TEMPERATURE_THRESHOLD] = true,
241 [NVME_ERROR_RECOVERY] = true,
242 [NVME_VOLATILE_WRITE_CACHE] = true,
243 [NVME_NUMBER_OF_QUEUES] = true,
244 [NVME_INTERRUPT_COALESCING] = true,
245 [NVME_INTERRUPT_VECTOR_CONF] = true,
246 [NVME_WRITE_ATOMICITY] = true,
247 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
248 [NVME_TIMESTAMP] = true,
249 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
250 [NVME_COMMAND_SET_PROFILE] = true,
251 [NVME_FDP_MODE] = true,
252 [NVME_FDP_EVENTS] = true,
253 };
254
255 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
256 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
258 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
259 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
260 [NVME_WRITE_ATOMICITY] = NVME_FEAT_CAP_CHANGE,
261 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
262 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
263 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
264 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
265 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
266 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
267 };
268
269 static const uint32_t nvme_cse_acs[256] = {
270 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
277 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
280 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
281 [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
282 [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
283 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
284 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
285 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
286 };
287
288 static const uint32_t nvme_cse_iocs_none[256];
289
290 static const uint32_t nvme_cse_iocs_nvm[256] = {
291 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
292 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
294 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
295 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
296 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
297 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
298 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
299 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
300 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
301 };
302
303 static const uint32_t nvme_cse_iocs_zoned[256] = {
304 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
305 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
307 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
308 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
309 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
310 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
311 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
312 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
313 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
314 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
315 };
316
317 static void nvme_process_sq(void *opaque);
318 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
319 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
320
321 static uint16_t nvme_sqid(NvmeRequest *req)
322 {
323 return le16_to_cpu(req->sq->sqid);
324 }
325
326 static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
327 uint16_t ph)
328 {
329 uint16_t rgif = ns->endgrp->fdp.rgif;
330
331 if (!rgif) {
332 return ph;
333 }
334
335 return (rg << (16 - rgif)) | ph;
336 }
337
338 static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
339 {
340 return ph < ns->fdp.nphs;
341 }
342
343 static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
344 {
345 return rg < endgrp->fdp.nrg;
346 }
347
348 static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
349 {
350 uint16_t rgif = ns->endgrp->fdp.rgif;
351
352 if (!rgif) {
353 return pid;
354 }
355
356 return pid & ((1 << (15 - rgif)) - 1);
357 }
358
359 static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
360 {
361 uint16_t rgif = ns->endgrp->fdp.rgif;
362
363 if (!rgif) {
364 return 0;
365 }
366
367 return pid >> (16 - rgif);
368 }
369
370 static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
371 uint16_t *ph, uint16_t *rg)
372 {
373 *rg = nvme_pid2rg(ns, pid);
374 *ph = nvme_pid2ph(ns, pid);
375
376 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
377 }
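
/*
 * Illustrative example of the placement identifier encoding used above (the
 * numbers are made up, not defaults): with rgif = 4, the reclaim group is
 * stored in the top 4 bits of the pid, so
 *
 *   nvme_make_pid(ns, 2, 5) == (2 << 12) | 5 == 0x2005
 *   nvme_pid2rg(ns, 0x2005) == 2 and nvme_pid2ph(ns, 0x2005) == 5
 *
 * With rgif = 0 there is a single reclaim group and the pid is simply the
 * placement handle.
 */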
378
379 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
380 NvmeZoneState state)
381 {
382 if (QTAILQ_IN_USE(zone, entry)) {
383 switch (nvme_get_zone_state(zone)) {
384 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
385 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
386 break;
387 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
388 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
389 break;
390 case NVME_ZONE_STATE_CLOSED:
391 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
392 break;
393 case NVME_ZONE_STATE_FULL:
394 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
395 default:
396 ;
397 }
398 }
399
400 nvme_set_zone_state(zone, state);
401
402 switch (state) {
403 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
404 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
405 break;
406 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
407 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
408 break;
409 case NVME_ZONE_STATE_CLOSED:
410 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
411 break;
412 case NVME_ZONE_STATE_FULL:
413 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
414 case NVME_ZONE_STATE_READ_ONLY:
415 break;
416 default:
417 zone->d.za = 0;
418 }
419 }
420
421 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
422 uint32_t opn, uint32_t zrwa)
423 {
424 if (ns->params.max_active_zones != 0 &&
425 ns->nr_active_zones + act > ns->params.max_active_zones) {
426 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
427 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
428 }
429
430 if (ns->params.max_open_zones != 0 &&
431 ns->nr_open_zones + opn > ns->params.max_open_zones) {
432 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
433 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
434 }
435
436 if (zrwa > ns->zns.numzrwa) {
437 return NVME_NOZRWA | NVME_DNR;
438 }
439
440 return NVME_SUCCESS;
441 }
442
443 /*
444 * Check if we can open a zone without exceeding open/active limits.
445 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
446 */
447 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
448 {
449 return nvme_zns_check_resources(ns, act, opn, 0);
450 }
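
/*
 * For instance, transitioning an Empty zone to an opened state consumes one
 * active and one open resource, so such a transition is typically guarded by
 * nvme_aor_check(ns, 1, 1), whereas opening an already active (Closed) zone
 * only needs nvme_aor_check(ns, 0, 1). This is an illustration of intended
 * usage, not an exhaustive list of callers.
 */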
451
452 static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
453 {
454 NvmeFdpEvent *ret = NULL;
455 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
456
457 ret = &ebuf->events[ebuf->next++];
458 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
459 ebuf->next = 0;
460 }
461 if (is_full) {
462 ebuf->start = ebuf->next;
463 } else {
464 ebuf->nelems++;
465 }
466
467 memset(ret, 0, sizeof(NvmeFdpEvent));
468 ret->timestamp = nvme_get_timestamp(n);
469
470 return ret;
471 }
472
473 static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
474 {
475 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
476 }
477
478 static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
479 {
480 NvmeEnduranceGroup *endgrp = ns->endgrp;
481 NvmeRuHandle *ruh;
482 NvmeReclaimUnit *ru;
483 NvmeFdpEvent *e = NULL;
484 uint16_t ph, rg, ruhid;
485
486 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
487 return false;
488 }
489
490 ruhid = ns->fdp.phs[ph];
491
492 ruh = &endgrp->fdp.ruhs[ruhid];
493 ru = &ruh->rus[rg];
494
495 if (ru->ruamw) {
496 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
497 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
498 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
499 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
500 e->pid = cpu_to_le16(pid);
501 e->nsid = cpu_to_le32(ns->params.nsid);
502 e->rgid = cpu_to_le16(rg);
503 e->ruhid = cpu_to_le16(ruhid);
504 }
505
506 /* log (eventual) GC overhead of prematurely swapping the RU */
507 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
508 }
509
510 ru->ruamw = ruh->ruamw;
511
512 return true;
513 }
514
515 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
516 {
517 hwaddr hi, lo;
518
519 if (!n->cmb.cmse) {
520 return false;
521 }
522
523 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
524 hi = lo + int128_get64(n->cmb.mem.size);
525
526 return addr >= lo && addr < hi;
527 }
528
529 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
530 {
531 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
532 return &n->cmb.buf[addr - base];
533 }
534
535 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
536 {
537 hwaddr hi;
538
539 if (!n->pmr.cmse) {
540 return false;
541 }
542
543 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
544
545 return addr >= n->pmr.cba && addr < hi;
546 }
547
548 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
549 {
550 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
551 }
552
553 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
554 {
555 hwaddr hi, lo;
556
557 /*
558 * The purpose of this check is to guard against invalid "local" access to
559 * the iomem (i.e. controller registers). Thus, we check against the range
560 * covered by the 'bar0' MemoryRegion since that is currently composed of
561 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
562 * that if the device model is ever changed to allow the CMB to be located
563 * in BAR0 as well, then this must be changed.
564 */
565 lo = n->bar0.addr;
566 hi = lo + int128_get64(n->bar0.size);
567
568 return addr >= lo && addr < hi;
569 }
570
571 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
572 {
573 hwaddr hi = addr + size - 1;
574 if (hi < addr) {
575 return 1;
576 }
577
578 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
579 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
580 return 0;
581 }
582
583 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
584 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
585 return 0;
586 }
587
588 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
589 }
590
591 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
592 {
593 hwaddr hi = addr + size - 1;
594 if (hi < addr) {
595 return 1;
596 }
597
598 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
599 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
600 return 0;
601 }
602
603 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
604 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
605 return 0;
606 }
607
608 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
609 }
610
611 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
612 {
613 return nsid &&
614 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
615 }
616
617 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
618 {
619 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
620 }
621
622 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
623 {
624 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
625 }
626
627 static void nvme_inc_cq_tail(NvmeCQueue *cq)
628 {
629 cq->tail++;
630 if (cq->tail >= cq->size) {
631 cq->tail = 0;
632 cq->phase = !cq->phase;
633 }
634 }
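
/*
 * Example of the phase tag behaviour implemented above (the queue size is
 * arbitrary): with cq->size == 4 the tail advances 0, 1, 2, 3 and then wraps
 * back to 0, at which point cq->phase is inverted. The host compares the
 * phase bit of each completion queue entry against its expected phase to
 * detect newly posted completions.
 */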
635
636 static void nvme_inc_sq_head(NvmeSQueue *sq)
637 {
638 sq->head = (sq->head + 1) % sq->size;
639 }
640
641 static uint8_t nvme_cq_full(NvmeCQueue *cq)
642 {
643 return (cq->tail + 1) % cq->size == cq->head;
644 }
645
646 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
647 {
648 return sq->head == sq->tail;
649 }
650
651 static void nvme_irq_check(NvmeCtrl *n)
652 {
653 PCIDevice *pci = PCI_DEVICE(n);
654 uint32_t intms = ldl_le_p(&n->bar.intms);
655
656 if (msix_enabled(pci)) {
657 return;
658 }
659
660 /* VFs do not implement INTx */
661 if (pci_is_vf(pci)) {
662 return;
663 }
664
665 if (~intms & n->irq_status) {
666 pci_irq_assert(pci);
667 } else {
668 pci_irq_deassert(pci);
669 }
670 }
671
672 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
673 {
674 PCIDevice *pci = PCI_DEVICE(n);
675
676 if (cq->irq_enabled) {
677 if (msix_enabled(pci)) {
678 trace_pci_nvme_irq_msix(cq->vector);
679 msix_notify(pci, cq->vector);
680 } else {
681 trace_pci_nvme_irq_pin();
682 assert(cq->vector < 32);
683 n->irq_status |= 1 << cq->vector;
684 nvme_irq_check(n);
685 }
686 } else {
687 trace_pci_nvme_irq_masked();
688 }
689 }
690
691 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
692 {
693 if (cq->irq_enabled) {
694 if (msix_enabled(PCI_DEVICE(n))) {
695 return;
696 } else {
697 assert(cq->vector < 32);
698 if (!n->cq_pending) {
699 n->irq_status &= ~(1 << cq->vector);
700 }
701 nvme_irq_check(n);
702 }
703 }
704 }
705
706 static void nvme_req_clear(NvmeRequest *req)
707 {
708 req->ns = NULL;
709 req->opaque = NULL;
710 req->aiocb = NULL;
711 memset(&req->cqe, 0x0, sizeof(req->cqe));
712 req->status = NVME_SUCCESS;
713 }
714
715 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
716 {
717 if (dma) {
718 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
719 sg->flags = NVME_SG_DMA;
720 } else {
721 qemu_iovec_init(&sg->iov, 0);
722 }
723
724 sg->flags |= NVME_SG_ALLOC;
725 }
726
727 static inline void nvme_sg_unmap(NvmeSg *sg)
728 {
729 if (!(sg->flags & NVME_SG_ALLOC)) {
730 return;
731 }
732
733 if (sg->flags & NVME_SG_DMA) {
734 qemu_sglist_destroy(&sg->qsg);
735 } else {
736 qemu_iovec_destroy(&sg->iov);
737 }
738
739 memset(sg, 0x0, sizeof(*sg));
740 }
741
742 /*
743 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
744 * holds both data and metadata. This function splits the data and metadata
745 * into two separate QSG/IOVs.
746 */
747 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
748 NvmeSg *mdata)
749 {
750 NvmeSg *dst = data;
751 uint32_t trans_len, count = ns->lbasz;
752 uint64_t offset = 0;
753 bool dma = sg->flags & NVME_SG_DMA;
754 size_t sge_len;
755 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
756 int sg_idx = 0;
757
758 assert(sg->flags & NVME_SG_ALLOC);
759
760 while (sg_len) {
761 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
762
763 trans_len = MIN(sg_len, count);
764 trans_len = MIN(trans_len, sge_len - offset);
765
766 if (dst) {
767 if (dma) {
768 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
769 trans_len);
770 } else {
771 qemu_iovec_add(&dst->iov,
772 sg->iov.iov[sg_idx].iov_base + offset,
773 trans_len);
774 }
775 }
776
777 sg_len -= trans_len;
778 count -= trans_len;
779 offset += trans_len;
780
781 if (count == 0) {
782 dst = (dst == data) ? mdata : data;
783 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
784 }
785
786 if (sge_len == offset) {
787 offset = 0;
788 sg_idx++;
789 }
790 }
791 }
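
/*
 * Worked example for nvme_sg_split() (sizes are illustrative): with
 * lbasz = 512 and lbaf.ms = 8, an extended-LBA transfer of two logical
 * blocks is laid out as 512B data, 8B metadata, 512B data, 8B metadata.
 * The loop above walks that layout, emitting the 512B ranges into `data`
 * and the 8B ranges into `mdata`, splitting any range that straddles an
 * SGE/iovec boundary.
 */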
792
793 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
794 size_t len)
795 {
796 if (!len) {
797 return NVME_SUCCESS;
798 }
799
800 trace_pci_nvme_map_addr_cmb(addr, len);
801
802 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
803 return NVME_DATA_TRAS_ERROR;
804 }
805
806 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
807
808 return NVME_SUCCESS;
809 }
810
811 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
812 size_t len)
813 {
814 if (!len) {
815 return NVME_SUCCESS;
816 }
817
818 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
819 return NVME_DATA_TRAS_ERROR;
820 }
821
822 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
823
824 return NVME_SUCCESS;
825 }
826
827 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
828 {
829 bool cmb = false, pmr = false;
830
831 if (!len) {
832 return NVME_SUCCESS;
833 }
834
835 trace_pci_nvme_map_addr(addr, len);
836
837 if (nvme_addr_is_iomem(n, addr)) {
838 return NVME_DATA_TRAS_ERROR;
839 }
840
841 if (nvme_addr_is_cmb(n, addr)) {
842 cmb = true;
843 } else if (nvme_addr_is_pmr(n, addr)) {
844 pmr = true;
845 }
846
847 if (cmb || pmr) {
848 if (sg->flags & NVME_SG_DMA) {
849 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
850 }
851
852 if (sg->iov.niov + 1 > IOV_MAX) {
853 goto max_mappings_exceeded;
854 }
855
856 if (cmb) {
857 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
858 } else {
859 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
860 }
861 }
862
863 if (!(sg->flags & NVME_SG_DMA)) {
864 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
865 }
866
867 if (sg->qsg.nsg + 1 > IOV_MAX) {
868 goto max_mappings_exceeded;
869 }
870
871 qemu_sglist_add(&sg->qsg, addr, len);
872
873 return NVME_SUCCESS;
874
875 max_mappings_exceeded:
876 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
877 "number of mappings exceed 1024");
878 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
879 }
880
881 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
882 {
883 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
884 }
885
886 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
887 uint64_t prp2, uint32_t len)
888 {
889 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
890 trans_len = MIN(len, trans_len);
891 int num_prps = (len >> n->page_bits) + 1;
892 uint16_t status;
893 int ret;
894
895 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
896
897 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
898
899 status = nvme_map_addr(n, sg, prp1, trans_len);
900 if (status) {
901 goto unmap;
902 }
903
904 len -= trans_len;
905 if (len) {
906 if (len > n->page_size) {
907 g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
908 uint32_t nents, prp_trans;
909 int i = 0;
910
911 /*
912 * The first PRP list, pointed to by PRP2, may start at an offset within
913 * a page. Hence, we need to calculate the number of entries based on
914 * that offset.
915 */
916 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
917 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
918 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
919 if (ret) {
920 trace_pci_nvme_err_addr_read(prp2);
921 status = NVME_DATA_TRAS_ERROR;
922 goto unmap;
923 }
924 while (len != 0) {
925 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
926
927 if (i == nents - 1 && len > n->page_size) {
928 if (unlikely(prp_ent & (n->page_size - 1))) {
929 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
930 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
931 goto unmap;
932 }
933
934 i = 0;
935 nents = (len + n->page_size - 1) >> n->page_bits;
936 nents = MIN(nents, n->max_prp_ents);
937 prp_trans = nents * sizeof(uint64_t);
938 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
939 prp_trans);
940 if (ret) {
941 trace_pci_nvme_err_addr_read(prp_ent);
942 status = NVME_DATA_TRAS_ERROR;
943 goto unmap;
944 }
945 prp_ent = le64_to_cpu(prp_list[i]);
946 }
947
948 if (unlikely(prp_ent & (n->page_size - 1))) {
949 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
950 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
951 goto unmap;
952 }
953
954 trans_len = MIN(len, n->page_size);
955 status = nvme_map_addr(n, sg, prp_ent, trans_len);
956 if (status) {
957 goto unmap;
958 }
959
960 len -= trans_len;
961 i++;
962 }
963 } else {
964 if (unlikely(prp2 & (n->page_size - 1))) {
965 trace_pci_nvme_err_invalid_prp2_align(prp2);
966 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
967 goto unmap;
968 }
969 status = nvme_map_addr(n, sg, prp2, len);
970 if (status) {
971 goto unmap;
972 }
973 }
974 }
975
976 return NVME_SUCCESS;
977
978 unmap:
979 nvme_sg_unmap(sg);
980 return status;
981 }
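
/*
 * Worked PRP example (illustrative numbers): with a 4 KiB page size, a
 * 16 KiB transfer with prp1 == 0x80200 first maps 4096 - 0x200 = 3584 bytes
 * from prp1. The remaining 12800 bytes exceed one page, so prp2 points to a
 * PRP list and ceil(12800 / 4096) = 4 list entries are consumed, each of
 * which must be page aligned. Had the remainder fit within a single page,
 * prp2 would have been used directly as the second data pointer.
 */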
982
983 /*
984 * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
985 * number of bytes mapped from *len.
986 */
987 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
988 NvmeSglDescriptor *segment, uint64_t nsgld,
989 size_t *len, NvmeCmd *cmd)
990 {
991 dma_addr_t addr, trans_len;
992 uint32_t dlen;
993 uint16_t status;
994
995 for (int i = 0; i < nsgld; i++) {
996 uint8_t type = NVME_SGL_TYPE(segment[i].type);
997
998 switch (type) {
999 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
1000 break;
1001 case NVME_SGL_DESCR_TYPE_SEGMENT:
1002 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1003 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
1004 default:
1005 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
1006 }
1007
1008 dlen = le32_to_cpu(segment[i].len);
1009
1010 if (!dlen) {
1011 continue;
1012 }
1013
1014 if (*len == 0) {
1015 /*
1016 * All data has been mapped, but the SGL contains additional
1017 * segments and/or descriptors. Depending on the advertised SGL
1018 * support (SGLS), the controller may accept and ignore the rest of the SGL.
1019 */
1020 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1021 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1022 break;
1023 }
1024
1025 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1026 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1027 }
1028
1029 trans_len = MIN(*len, dlen);
1030
1031 addr = le64_to_cpu(segment[i].addr);
1032
1033 if (UINT64_MAX - addr < dlen) {
1034 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1035 }
1036
1037 status = nvme_map_addr(n, sg, addr, trans_len);
1038 if (status) {
1039 return status;
1040 }
1041
1042 *len -= trans_len;
1043 }
1044
1045 return NVME_SUCCESS;
1046 }
1047
1048 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1049 size_t len, NvmeCmd *cmd)
1050 {
1051 /*
1052 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1053 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1054 * to be larger (as in number of bytes required to describe the SGL
1055 * descriptors and segment chain) than the command transfer size, so it is
1056 * not bounded by MDTS.
1057 */
1058 #define SEG_CHUNK_SIZE 256
1059
1060 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1061 uint64_t nsgld;
1062 uint32_t seg_len;
1063 uint16_t status;
1064 hwaddr addr;
1065 int ret;
1066
1067 sgld = &sgl;
1068 addr = le64_to_cpu(sgl.addr);
1069
1070 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1071
1072 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1073
1074 /*
1075 * If the entire transfer can be described with a single data block it can
1076 * be mapped directly.
1077 */
1078 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1079 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1080 if (status) {
1081 goto unmap;
1082 }
1083
1084 goto out;
1085 }
1086
1087 for (;;) {
1088 switch (NVME_SGL_TYPE(sgld->type)) {
1089 case NVME_SGL_DESCR_TYPE_SEGMENT:
1090 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1091 break;
1092 default:
1093 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1094 }
1095
1096 seg_len = le32_to_cpu(sgld->len);
1097
1098 /* check the length of the (Last) Segment descriptor */
1099 if (!seg_len || seg_len & 0xf) {
1100 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1101 }
1102
1103 if (UINT64_MAX - addr < seg_len) {
1104 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1105 }
1106
1107 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1108
1109 while (nsgld > SEG_CHUNK_SIZE) {
1110 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1111 trace_pci_nvme_err_addr_read(addr);
1112 status = NVME_DATA_TRAS_ERROR;
1113 goto unmap;
1114 }
1115
1116 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1117 &len, cmd);
1118 if (status) {
1119 goto unmap;
1120 }
1121
1122 nsgld -= SEG_CHUNK_SIZE;
1123 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1124 }
1125
1126 ret = nvme_addr_read(n, addr, segment, nsgld *
1127 sizeof(NvmeSglDescriptor));
1128 if (ret) {
1129 trace_pci_nvme_err_addr_read(addr);
1130 status = NVME_DATA_TRAS_ERROR;
1131 goto unmap;
1132 }
1133
1134 last_sgld = &segment[nsgld - 1];
1135
1136 /*
1137 * If the segment ends with a Data Block, then we are done.
1138 */
1139 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1140 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1141 if (status) {
1142 goto unmap;
1143 }
1144
1145 goto out;
1146 }
1147
1148 /*
1149 * If the last descriptor was not a Data Block, then the current
1150 * segment must not be a Last Segment.
1151 */
1152 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1153 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1154 goto unmap;
1155 }
1156
1157 sgld = last_sgld;
1158 addr = le64_to_cpu(sgld->addr);
1159
1160 /*
1161 * Do not map the last descriptor; it will be a Segment or Last Segment
1162 * descriptor and is handled by the next iteration.
1163 */
1164 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1165 if (status) {
1166 goto unmap;
1167 }
1168 }
1169
1170 out:
1171 /* if there is any residual left in len, the SGL was too short */
1172 if (len) {
1173 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1174 goto unmap;
1175 }
1176
1177 return NVME_SUCCESS;
1178
1179 unmap:
1180 nvme_sg_unmap(sg);
1181 return status;
1182 }
1183
1184 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1185 NvmeCmd *cmd)
1186 {
1187 uint64_t prp1, prp2;
1188
1189 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1190 case NVME_PSDT_PRP:
1191 prp1 = le64_to_cpu(cmd->dptr.prp1);
1192 prp2 = le64_to_cpu(cmd->dptr.prp2);
1193
1194 return nvme_map_prp(n, sg, prp1, prp2, len);
1195 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1196 case NVME_PSDT_SGL_MPTR_SGL:
1197 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1198 default:
1199 return NVME_INVALID_FIELD;
1200 }
1201 }
1202
1203 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1204 NvmeCmd *cmd)
1205 {
1206 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1207 hwaddr mptr = le64_to_cpu(cmd->mptr);
1208 uint16_t status;
1209
1210 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1211 NvmeSglDescriptor sgl;
1212
1213 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1214 return NVME_DATA_TRAS_ERROR;
1215 }
1216
1217 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1218 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1219 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1220 }
1221
1222 return status;
1223 }
1224
1225 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1226 status = nvme_map_addr(n, sg, mptr, len);
1227 if (status) {
1228 nvme_sg_unmap(sg);
1229 }
1230
1231 return status;
1232 }
1233
1234 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1235 {
1236 NvmeNamespace *ns = req->ns;
1237 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1238 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1239 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1240 size_t len = nvme_l2b(ns, nlb);
1241 uint16_t status;
1242
1243 if (nvme_ns_ext(ns) &&
1244 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1245 NvmeSg sg;
1246
1247 len += nvme_m2b(ns, nlb);
1248
1249 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1250 if (status) {
1251 return status;
1252 }
1253
1254 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1255 nvme_sg_split(&sg, ns, &req->sg, NULL);
1256 nvme_sg_unmap(&sg);
1257
1258 return NVME_SUCCESS;
1259 }
1260
1261 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1262 }
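
/*
 * Note on the condition above: when end-to-end protection is enabled and
 * PRACT is set while the metadata size equals the protection information
 * tuple size, the controller generates/strips the protection information
 * itself, so only the data portion is mapped even on an extended-LBA
 * namespace. In every other extended-LBA case the host buffer interleaves
 * data and metadata and must be split with nvme_sg_split().
 */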
1263
1264 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1265 {
1266 NvmeNamespace *ns = req->ns;
1267 size_t len = nvme_m2b(ns, nlb);
1268 uint16_t status;
1269
1270 if (nvme_ns_ext(ns)) {
1271 NvmeSg sg;
1272
1273 len += nvme_l2b(ns, nlb);
1274
1275 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1276 if (status) {
1277 return status;
1278 }
1279
1280 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1281 nvme_sg_split(&sg, ns, NULL, &req->sg);
1282 nvme_sg_unmap(&sg);
1283
1284 return NVME_SUCCESS;
1285 }
1286
1287 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1288 }
1289
1290 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1291 uint32_t len, uint32_t bytes,
1292 int32_t skip_bytes, int64_t offset,
1293 NvmeTxDirection dir)
1294 {
1295 hwaddr addr;
1296 uint32_t trans_len, count = bytes;
1297 bool dma = sg->flags & NVME_SG_DMA;
1298 int64_t sge_len;
1299 int sg_idx = 0;
1300 int ret;
1301
1302 assert(sg->flags & NVME_SG_ALLOC);
1303
1304 while (len) {
1305 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1306
1307 if (sge_len - offset < 0) {
1308 offset -= sge_len;
1309 sg_idx++;
1310 continue;
1311 }
1312
1313 if (sge_len == offset) {
1314 offset = 0;
1315 sg_idx++;
1316 continue;
1317 }
1318
1319 trans_len = MIN(len, count);
1320 trans_len = MIN(trans_len, sge_len - offset);
1321
1322 if (dma) {
1323 addr = sg->qsg.sg[sg_idx].base + offset;
1324 } else {
1325 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1326 }
1327
1328 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1329 ret = nvme_addr_read(n, addr, ptr, trans_len);
1330 } else {
1331 ret = nvme_addr_write(n, addr, ptr, trans_len);
1332 }
1333
1334 if (ret) {
1335 return NVME_DATA_TRAS_ERROR;
1336 }
1337
1338 ptr += trans_len;
1339 len -= trans_len;
1340 count -= trans_len;
1341 offset += trans_len;
1342
1343 if (count == 0) {
1344 count = bytes;
1345 offset += skip_bytes;
1346 }
1347 }
1348
1349 return NVME_SUCCESS;
1350 }
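
/*
 * Worked example for nvme_tx_interleaved() (illustrative sizes): to copy
 * only the data portion of an extended-LBA buffer with lbasz = 512 and
 * lbaf.ms = 8, callers pass bytes = 512, skip_bytes = 8 and offset = 0, so
 * the loop alternates between copying 512 bytes and skipping 8 bytes. The
 * metadata-only variant passes bytes = 8, skip_bytes = 512 and offset = 512.
 */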
1351
1352 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1353 NvmeTxDirection dir)
1354 {
1355 assert(sg->flags & NVME_SG_ALLOC);
1356
1357 if (sg->flags & NVME_SG_DMA) {
1358 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1359 dma_addr_t residual;
1360
1361 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1362 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1363 } else {
1364 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1365 }
1366
1367 if (unlikely(residual)) {
1368 trace_pci_nvme_err_invalid_dma();
1369 return NVME_INVALID_FIELD | NVME_DNR;
1370 }
1371 } else {
1372 size_t bytes;
1373
1374 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1375 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1376 } else {
1377 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1378 }
1379
1380 if (unlikely(bytes != len)) {
1381 trace_pci_nvme_err_invalid_dma();
1382 return NVME_INVALID_FIELD | NVME_DNR;
1383 }
1384 }
1385
1386 return NVME_SUCCESS;
1387 }
1388
1389 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1390 NvmeRequest *req)
1391 {
1392 uint16_t status;
1393
1394 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1395 if (status) {
1396 return status;
1397 }
1398
1399 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1400 }
1401
1402 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1403 NvmeRequest *req)
1404 {
1405 uint16_t status;
1406
1407 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1408 if (status) {
1409 return status;
1410 }
1411
1412 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1413 }
1414
1415 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1416 NvmeTxDirection dir, NvmeRequest *req)
1417 {
1418 NvmeNamespace *ns = req->ns;
1419 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1420 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1421 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1422
1423 if (nvme_ns_ext(ns) &&
1424 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1425 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1426 ns->lbaf.ms, 0, dir);
1427 }
1428
1429 return nvme_tx(n, &req->sg, ptr, len, dir);
1430 }
1431
1432 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1433 NvmeTxDirection dir, NvmeRequest *req)
1434 {
1435 NvmeNamespace *ns = req->ns;
1436 uint16_t status;
1437
1438 if (nvme_ns_ext(ns)) {
1439 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1440 ns->lbasz, ns->lbasz, dir);
1441 }
1442
1443 nvme_sg_unmap(&req->sg);
1444
1445 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1446 if (status) {
1447 return status;
1448 }
1449
1450 return nvme_tx(n, &req->sg, ptr, len, dir);
1451 }
1452
1453 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1454 uint32_t align, BlockCompletionFunc *cb,
1455 NvmeRequest *req)
1456 {
1457 assert(req->sg.flags & NVME_SG_ALLOC);
1458
1459 if (req->sg.flags & NVME_SG_DMA) {
1460 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1461 } else {
1462 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1463 }
1464 }
1465
1466 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1467 uint32_t align, BlockCompletionFunc *cb,
1468 NvmeRequest *req)
1469 {
1470 assert(req->sg.flags & NVME_SG_ALLOC);
1471
1472 if (req->sg.flags & NVME_SG_DMA) {
1473 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1474 } else {
1475 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1476 }
1477 }
1478
1479 static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1480 {
1481 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1482
1483 stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1484 MEMTXATTRS_UNSPECIFIED);
1485 }
1486
1487 static void nvme_update_cq_head(NvmeCQueue *cq)
1488 {
1489 ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1490 MEMTXATTRS_UNSPECIFIED);
1491
1492 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1493 }
1494
1495 static void nvme_post_cqes(void *opaque)
1496 {
1497 NvmeCQueue *cq = opaque;
1498 NvmeCtrl *n = cq->ctrl;
1499 NvmeRequest *req, *next;
1500 bool pending = cq->head != cq->tail;
1501 int ret;
1502
1503 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1504 NvmeSQueue *sq;
1505 hwaddr addr;
1506
1507 if (n->dbbuf_enabled) {
1508 nvme_update_cq_eventidx(cq);
1509 nvme_update_cq_head(cq);
1510 }
1511
1512 if (nvme_cq_full(cq)) {
1513 break;
1514 }
1515
1516 sq = req->sq;
1517 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1518 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1519 req->cqe.sq_head = cpu_to_le16(sq->head);
1520 addr = cq->dma_addr + (cq->tail << NVME_CQES);
1521 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1522 sizeof(req->cqe));
1523 if (ret) {
1524 trace_pci_nvme_err_addr_write(addr);
1525 trace_pci_nvme_err_cfs();
1526 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1527 break;
1528 }
1529
1530 QTAILQ_REMOVE(&cq->req_list, req, entry);
1531
1532 nvme_inc_cq_tail(cq);
1533 nvme_sg_unmap(&req->sg);
1534
1535 if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
1536 qemu_bh_schedule(sq->bh);
1537 }
1538
1539 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1540 }
1541 if (cq->tail != cq->head) {
1542 if (cq->irq_enabled && !pending) {
1543 n->cq_pending++;
1544 }
1545
1546 nvme_irq_assert(n, cq);
1547 }
1548 }
1549
1550 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1551 {
1552 assert(cq->cqid == req->sq->cqid);
1553 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1554 le32_to_cpu(req->cqe.result),
1555 le32_to_cpu(req->cqe.dw1),
1556 req->status);
1557
1558 if (req->status) {
1559 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1560 req->status, req->cmd.opcode);
1561 }
1562
1563 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1564 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1565
1566 qemu_bh_schedule(cq->bh);
1567 }
1568
1569 static void nvme_process_aers(void *opaque)
1570 {
1571 NvmeCtrl *n = opaque;
1572 NvmeAsyncEvent *event, *next;
1573
1574 trace_pci_nvme_process_aers(n->aer_queued);
1575
1576 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1577 NvmeRequest *req;
1578 NvmeAerResult *result;
1579
1580 /* can't post cqe if there is nothing to complete */
1581 if (!n->outstanding_aers) {
1582 trace_pci_nvme_no_outstanding_aers();
1583 break;
1584 }
1585
1586 /* ignore if masked (cqe posted, but event not cleared) */
1587 if (n->aer_mask & (1 << event->result.event_type)) {
1588 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1589 continue;
1590 }
1591
1592 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1593 n->aer_queued--;
1594
1595 n->aer_mask |= 1 << event->result.event_type;
1596 n->outstanding_aers--;
1597
1598 req = n->aer_reqs[n->outstanding_aers];
1599
1600 result = (NvmeAerResult *) &req->cqe.result;
1601 result->event_type = event->result.event_type;
1602 result->event_info = event->result.event_info;
1603 result->log_page = event->result.log_page;
1604 g_free(event);
1605
1606 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1607 result->log_page);
1608
1609 nvme_enqueue_req_completion(&n->admin_cq, req);
1610 }
1611 }
1612
1613 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1614 uint8_t event_info, uint8_t log_page)
1615 {
1616 NvmeAsyncEvent *event;
1617
1618 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1619
1620 if (n->aer_queued == n->params.aer_max_queued) {
1621 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1622 return;
1623 }
1624
1625 event = g_new(NvmeAsyncEvent, 1);
1626 event->result = (NvmeAerResult) {
1627 .event_type = event_type,
1628 .event_info = event_info,
1629 .log_page = log_page,
1630 };
1631
1632 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1633 n->aer_queued++;
1634
1635 nvme_process_aers(n);
1636 }
1637
1638 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1639 {
1640 uint8_t aer_info;
1641
1642 /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1643 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1644 return;
1645 }
1646
1647 switch (event) {
1648 case NVME_SMART_SPARE:
1649 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1650 break;
1651 case NVME_SMART_TEMPERATURE:
1652 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1653 break;
1654 case NVME_SMART_RELIABILITY:
1655 case NVME_SMART_MEDIA_READ_ONLY:
1656 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1657 case NVME_SMART_PMR_UNRELIABLE:
1658 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1659 break;
1660 default:
1661 return;
1662 }
1663
1664 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1665 }
1666
1667 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1668 {
1669 NvmeAsyncEvent *event, *next;
1670
1671 n->aer_mask &= ~(1 << event_type);
1672
1673 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1674 if (event->result.event_type == event_type) {
1675 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1676 n->aer_queued--;
1677 g_free(event);
1678 }
1679 }
1680 }
1681
1682 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1683 {
1684 uint8_t mdts = n->params.mdts;
1685
1686 if (mdts && len > n->page_size << mdts) {
1687 trace_pci_nvme_err_mdts(len);
1688 return NVME_INVALID_FIELD | NVME_DNR;
1689 }
1690
1691 return NVME_SUCCESS;
1692 }
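
/*
 * For example (using the default documented at the top of this file): with
 * mdts = 7 and a 4 KiB controller page size, transfers up to
 * 4 KiB << 7 = 512 KiB are accepted and anything larger fails with Invalid
 * Field. An mdts of 0 means no limit is advertised or enforced.
 */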
1693
1694 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1695 uint32_t nlb)
1696 {
1697 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1698
1699 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1700 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1701 return NVME_LBA_RANGE | NVME_DNR;
1702 }
1703
1704 return NVME_SUCCESS;
1705 }
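
/*
 * The first clause above catches arithmetic overflow; e.g. (values chosen
 * purely for illustration) slba = UINT64_MAX - 3 with nlb = 8 would wrap
 * around and otherwise slip past the nsze comparison.
 */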
1706
1707 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1708 uint32_t nlb, int flags)
1709 {
1710 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1711
1712 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1713 int64_t offset = nvme_l2b(ns, slba);
1714 int ret;
1715
1716 /*
1717 * `pnum` holds the number of bytes after offset that share the same
1718 * allocation status as the byte at offset. If `pnum` is different from
1719 * `bytes`, we should check the allocation status of the next range and
1720 * continue this until all bytes have been checked.
1721 */
1722 do {
1723 bytes -= pnum;
1724
1725 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1726 if (ret < 0) {
1727 return ret;
1728 }
1729
1730
1731 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1732 !!(ret & BDRV_BLOCK_ZERO));
1733
1734 if (!(ret & flags)) {
1735 return 1;
1736 }
1737
1738 offset += pnum;
1739 } while (pnum != bytes);
1740
1741 return 0;
1742 }
1743
1744 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1745 uint32_t nlb)
1746 {
1747 int ret;
1748 Error *err = NULL;
1749
1750 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1751 if (ret) {
1752 if (ret < 0) {
1753 error_setg_errno(&err, -ret, "unable to get block status");
1754 error_report_err(err);
1755
1756 return NVME_INTERNAL_DEV_ERROR;
1757 }
1758
1759 return NVME_DULB;
1760 }
1761
1762 return NVME_SUCCESS;
1763 }
1764
1765 static void nvme_aio_err(NvmeRequest *req, int ret)
1766 {
1767 uint16_t status = NVME_SUCCESS;
1768 Error *local_err = NULL;
1769
1770 switch (req->cmd.opcode) {
1771 case NVME_CMD_READ:
1772 status = NVME_UNRECOVERED_READ;
1773 break;
1774 case NVME_CMD_FLUSH:
1775 case NVME_CMD_WRITE:
1776 case NVME_CMD_WRITE_ZEROES:
1777 case NVME_CMD_ZONE_APPEND:
1778 case NVME_CMD_COPY:
1779 status = NVME_WRITE_FAULT;
1780 break;
1781 default:
1782 status = NVME_INTERNAL_DEV_ERROR;
1783 break;
1784 }
1785
1786 if (ret == -ECANCELED) {
1787 status = NVME_CMD_ABORT_REQ;
1788 }
1789
1790 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1791
1792 error_setg_errno(&local_err, -ret, "aio failed");
1793 error_report_err(local_err);
1794
1795 /*
1796 * Set the command status code to the first encountered error but allow a
1797 * subsequent Internal Device Error to trump it.
1798 */
1799 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1800 return;
1801 }
1802
1803 req->status = status;
1804 }
1805
1806 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1807 {
1808 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1809 slba / ns->zone_size;
1810 }
1811
1812 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1813 {
1814 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1815
1816 if (zone_idx >= ns->num_zones) {
1817 return NULL;
1818 }
1819
1820 return &ns->zone_array[zone_idx];
1821 }
1822
1823 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1824 {
1825 uint64_t zslba = zone->d.zslba;
1826
1827 switch (nvme_get_zone_state(zone)) {
1828 case NVME_ZONE_STATE_EMPTY:
1829 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1830 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1831 case NVME_ZONE_STATE_CLOSED:
1832 return NVME_SUCCESS;
1833 case NVME_ZONE_STATE_FULL:
1834 trace_pci_nvme_err_zone_is_full(zslba);
1835 return NVME_ZONE_FULL;
1836 case NVME_ZONE_STATE_OFFLINE:
1837 trace_pci_nvme_err_zone_is_offline(zslba);
1838 return NVME_ZONE_OFFLINE;
1839 case NVME_ZONE_STATE_READ_ONLY:
1840 trace_pci_nvme_err_zone_is_read_only(zslba);
1841 return NVME_ZONE_READ_ONLY;
1842 default:
1843 g_assert_not_reached();
1844 }
1845
1846 return NVME_INTERNAL_DEV_ERROR;
1847 }
1848
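/*
 * Validate a write against the zone state and write pointer. If a Zone Random
 * Write Area is associated with the zone, the write must start at or ahead of
 * the write pointer and fit within the ZRWA window; otherwise it must begin
 * exactly at the write pointer. In both cases the write must not cross the
 * writable zone boundary.
 */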
1849 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1850 uint64_t slba, uint32_t nlb)
1851 {
1852 uint64_t zcap = nvme_zone_wr_boundary(zone);
1853 uint16_t status;
1854
1855 status = nvme_check_zone_state_for_write(zone);
1856 if (status) {
1857 return status;
1858 }
1859
1860 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1861 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1862
1863 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1864 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1865 return NVME_ZONE_INVALID_WRITE;
1866 }
1867 } else {
1868 if (unlikely(slba != zone->w_ptr)) {
1869 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1870 zone->w_ptr);
1871 return NVME_ZONE_INVALID_WRITE;
1872 }
1873 }
1874
1875 if (unlikely((slba + nlb) > zcap)) {
1876 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1877 return NVME_ZONE_BOUNDARY_ERROR;
1878 }
1879
1880 return NVME_SUCCESS;
1881 }
1882
1883 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1884 {
1885 switch (nvme_get_zone_state(zone)) {
1886 case NVME_ZONE_STATE_EMPTY:
1887 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1888 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1889 case NVME_ZONE_STATE_FULL:
1890 case NVME_ZONE_STATE_CLOSED:
1891 case NVME_ZONE_STATE_READ_ONLY:
1892 return NVME_SUCCESS;
1893 case NVME_ZONE_STATE_OFFLINE:
1894 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1895 return NVME_ZONE_OFFLINE;
1896 default:
1897 g_assert_not_reached();
1898 }
1899
1900 return NVME_INTERNAL_DEV_ERROR;
1901 }
1902
1903 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1904 uint32_t nlb)
1905 {
1906 NvmeZone *zone;
1907 uint64_t bndry, end;
1908 uint16_t status;
1909
1910 zone = nvme_get_zone_by_slba(ns, slba);
1911 assert(zone);
1912
1913 bndry = nvme_zone_rd_boundary(ns, zone);
1914 end = slba + nlb;
1915
1916 status = nvme_check_zone_state_for_read(zone);
1917 if (status) {
1918 ;
1919 } else if (unlikely(end > bndry)) {
1920 if (!ns->params.cross_zone_read) {
1921 status = NVME_ZONE_BOUNDARY_ERROR;
1922 } else {
1923 /*
1924 * Read across zone boundary - check that all subsequent
1925 * zones that are being read have an appropriate state.
1926 */
1927 do {
1928 zone++;
1929 status = nvme_check_zone_state_for_read(zone);
1930 if (status) {
1931 break;
1932 }
1933 } while (end > nvme_zone_rd_boundary(ns, zone));
1934 }
1935 }
1936
1937 return status;
1938 }
1939
1940 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1941 {
1942 switch (nvme_get_zone_state(zone)) {
1943 case NVME_ZONE_STATE_FULL:
1944 return NVME_SUCCESS;
1945
1946 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1947 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1948 nvme_aor_dec_open(ns);
1949 /* fallthrough */
1950 case NVME_ZONE_STATE_CLOSED:
1951 nvme_aor_dec_active(ns);
1952
1953 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1954 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1955 if (ns->params.numzrwa) {
1956 ns->zns.numzrwa++;
1957 }
1958 }
1959
1960 /* fallthrough */
1961 case NVME_ZONE_STATE_EMPTY:
1962 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1963 return NVME_SUCCESS;
1964
1965 default:
1966 return NVME_ZONE_INVAL_TRANSITION;
1967 }
1968 }
1969
1970 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1971 {
1972 switch (nvme_get_zone_state(zone)) {
1973 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1974 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1975 nvme_aor_dec_open(ns);
1976 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1977 /* fall through */
1978 case NVME_ZONE_STATE_CLOSED:
1979 return NVME_SUCCESS;
1980
1981 default:
1982 return NVME_ZONE_INVAL_TRANSITION;
1983 }
1984 }
1985
1986 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1987 {
1988 switch (nvme_get_zone_state(zone)) {
1989 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1990 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1991 nvme_aor_dec_open(ns);
1992 /* fallthrough */
1993 case NVME_ZONE_STATE_CLOSED:
1994 nvme_aor_dec_active(ns);
1995
1996 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1997 if (ns->params.numzrwa) {
1998 ns->zns.numzrwa++;
1999 }
2000 }
2001
2002 /* fallthrough */
2003 case NVME_ZONE_STATE_FULL:
2004 zone->w_ptr = zone->d.zslba;
2005 zone->d.wp = zone->w_ptr;
2006 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
2007 /* fallthrough */
2008 case NVME_ZONE_STATE_EMPTY:
2009 return NVME_SUCCESS;
2010
2011 default:
2012 return NVME_ZONE_INVAL_TRANSITION;
2013 }
2014 }
2015
2016 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
2017 {
2018 NvmeZone *zone;
2019
2020 if (ns->params.max_open_zones &&
2021 ns->nr_open_zones == ns->params.max_open_zones) {
2022 zone = QTAILQ_FIRST(&ns->imp_open_zones);
2023 if (zone) {
2024 /*
2025 * Automatically close this implicitly open zone.
2026 */
2027 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
2028 nvme_zrm_close(ns, zone);
2029 }
2030 }
2031 }
2032
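/*
 * Flags for nvme_zrm_open_flags(): NVME_ZRM_AUTO marks an implicit open
 * caused by a write (the zone transitions to Implicitly Opened), while
 * NVME_ZRM_ZRWA requests allocation of a Zone Random Write Area as part of an
 * explicit open.
 */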
2033 enum {
2034 NVME_ZRM_AUTO = 1 << 0,
2035 NVME_ZRM_ZRWA = 1 << 1,
2036 };
2037
2038 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
2039 NvmeZone *zone, int flags)
2040 {
2041 int act = 0;
2042 uint16_t status;
2043
2044 switch (nvme_get_zone_state(zone)) {
2045 case NVME_ZONE_STATE_EMPTY:
2046 act = 1;
2047
2048 /* fallthrough */
2049
2050 case NVME_ZONE_STATE_CLOSED:
2051 if (n->params.auto_transition_zones) {
2052 nvme_zrm_auto_transition_zone(ns);
2053 }
2054 status = nvme_zns_check_resources(ns, act, 1,
2055 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2056 if (status) {
2057 return status;
2058 }
2059
2060 if (act) {
2061 nvme_aor_inc_active(ns);
2062 }
2063
2064 nvme_aor_inc_open(ns);
2065
2066 if (flags & NVME_ZRM_AUTO) {
2067 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2068 return NVME_SUCCESS;
2069 }
2070
2071 /* fallthrough */
2072
2073 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2074 if (flags & NVME_ZRM_AUTO) {
2075 return NVME_SUCCESS;
2076 }
2077
2078 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2079
2080 /* fallthrough */
2081
2082 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2083 if (flags & NVME_ZRM_ZRWA) {
2084 ns->zns.numzrwa--;
2085
2086 zone->d.za |= NVME_ZA_ZRWA_VALID;
2087 }
2088
2089 return NVME_SUCCESS;
2090
2091 default:
2092 return NVME_ZONE_INVAL_TRANSITION;
2093 }
2094 }
2095
2096 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2097 NvmeZone *zone)
2098 {
2099 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2100 }
2101
2102 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2103 uint32_t nlb)
2104 {
2105 zone->d.wp += nlb;
2106
2107 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2108 nvme_zrm_finish(ns, zone);
2109 }
2110 }
2111
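/*
 * Implicitly flush part of a Zone Random Write Area: the number of logical
 * blocks to commit is rounded up to a multiple of the ZRWA flush granularity
 * (zrwafg) before both the internal write pointer and the reported zone write
 * pointer are advanced.
 */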
2112 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2113 uint32_t nlbc)
2114 {
2115 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2116
2117 nlbc = nzrwafgs * ns->zns.zrwafg;
2118
2119 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2120
2121 zone->w_ptr += nlbc;
2122
2123 nvme_advance_zone_wp(ns, zone, nlbc);
2124 }
2125
2126 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2127 {
2128 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2129 NvmeZone *zone;
2130 uint64_t slba;
2131 uint32_t nlb;
2132
2133 slba = le64_to_cpu(rw->slba);
2134 nlb = le16_to_cpu(rw->nlb) + 1;
2135 zone = nvme_get_zone_by_slba(ns, slba);
2136 assert(zone);
2137
2138 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2139 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2140 uint64_t elba = slba + nlb - 1;
2141
2142 if (elba > ezrwa) {
2143 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2144 }
2145
2146 return;
2147 }
2148
2149 nvme_advance_zone_wp(ns, zone, nlb);
2150 }
2151
2152 static inline bool nvme_is_write(NvmeRequest *req)
2153 {
2154 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2155
2156 return rw->opcode == NVME_CMD_WRITE ||
2157 rw->opcode == NVME_CMD_ZONE_APPEND ||
2158 rw->opcode == NVME_CMD_WRITE_ZEROES;
2159 }
2160
2161 static void nvme_misc_cb(void *opaque, int ret)
2162 {
2163 NvmeRequest *req = opaque;
2164
2165 trace_pci_nvme_misc_cb(nvme_cid(req));
2166
2167 if (ret) {
2168 nvme_aio_err(req, ret);
2169 }
2170
2171 nvme_enqueue_req_completion(nvme_cq(req), req);
2172 }
2173
2174 void nvme_rw_complete_cb(void *opaque, int ret)
2175 {
2176 NvmeRequest *req = opaque;
2177 NvmeNamespace *ns = req->ns;
2178 BlockBackend *blk = ns->blkconf.blk;
2179 BlockAcctCookie *acct = &req->acct;
2180 BlockAcctStats *stats = blk_get_stats(blk);
2181
2182 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2183
2184 if (ret) {
2185 block_acct_failed(stats, acct);
2186 nvme_aio_err(req, ret);
2187 } else {
2188 block_acct_done(stats, acct);
2189 }
2190
2191 if (ns->params.zoned && nvme_is_write(req)) {
2192 nvme_finalize_zoned_write(ns, req);
2193 }
2194
2195 nvme_enqueue_req_completion(nvme_cq(req), req);
2196 }
2197
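/*
 * Completion callback for plain reads and writes. If the namespace carries
 * metadata (lbaf.ms != 0), a second I/O is issued against the metadata
 * offset: Write Zeroes zeroes the metadata region, while extended-LBA
 * namespaces (or commands with a separate metadata pointer) remap the
 * scatter/gather list to the metadata and read or write it before completing
 * the request through nvme_rw_complete_cb().
 */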
2198 static void nvme_rw_cb(void *opaque, int ret)
2199 {
2200 NvmeRequest *req = opaque;
2201 NvmeNamespace *ns = req->ns;
2202
2203 BlockBackend *blk = ns->blkconf.blk;
2204
2205 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2206
2207 if (ret) {
2208 goto out;
2209 }
2210
2211 if (ns->lbaf.ms) {
2212 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2213 uint64_t slba = le64_to_cpu(rw->slba);
2214 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2215 uint64_t offset = nvme_moff(ns, slba);
2216
2217 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2218 size_t mlen = nvme_m2b(ns, nlb);
2219
2220 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2221 BDRV_REQ_MAY_UNMAP,
2222 nvme_rw_complete_cb, req);
2223 return;
2224 }
2225
2226 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2227 uint16_t status;
2228
2229 nvme_sg_unmap(&req->sg);
2230 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2231 if (status) {
2232 ret = -EFAULT;
2233 goto out;
2234 }
2235
2236 if (req->cmd.opcode == NVME_CMD_READ) {
2237 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2238 }
2239
2240 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2241 }
2242 }
2243
2244 out:
2245 nvme_rw_complete_cb(req, ret);
2246 }
2247
2248 static void nvme_verify_cb(void *opaque, int ret)
2249 {
2250 NvmeBounceContext *ctx = opaque;
2251 NvmeRequest *req = ctx->req;
2252 NvmeNamespace *ns = req->ns;
2253 BlockBackend *blk = ns->blkconf.blk;
2254 BlockAcctCookie *acct = &req->acct;
2255 BlockAcctStats *stats = blk_get_stats(blk);
2256 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2257 uint64_t slba = le64_to_cpu(rw->slba);
2258 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2259 uint16_t apptag = le16_to_cpu(rw->apptag);
2260 uint16_t appmask = le16_to_cpu(rw->appmask);
2261 uint64_t reftag = le32_to_cpu(rw->reftag);
2262 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2263 uint16_t status;
2264
2265 reftag |= cdw3 << 32;
2266
2267 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2268
2269 if (ret) {
2270 block_acct_failed(stats, acct);
2271 nvme_aio_err(req, ret);
2272 goto out;
2273 }
2274
2275 block_acct_done(stats, acct);
2276
2277 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2278 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2279 ctx->mdata.iov.size, slba);
2280 if (status) {
2281 req->status = status;
2282 goto out;
2283 }
2284
2285 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2286 ctx->mdata.bounce, ctx->mdata.iov.size,
2287 prinfo, slba, apptag, appmask, &reftag);
2288 }
2289
2290 out:
2291 qemu_iovec_destroy(&ctx->data.iov);
2292 g_free(ctx->data.bounce);
2293
2294 qemu_iovec_destroy(&ctx->mdata.iov);
2295 g_free(ctx->mdata.bounce);
2296
2297 g_free(ctx);
2298
2299 nvme_enqueue_req_completion(nvme_cq(req), req);
2300 }
2301
2302
2303 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2304 {
2305 NvmeBounceContext *ctx = opaque;
2306 NvmeRequest *req = ctx->req;
2307 NvmeNamespace *ns = req->ns;
2308 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2309 uint64_t slba = le64_to_cpu(rw->slba);
2310 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2311 size_t mlen = nvme_m2b(ns, nlb);
2312 uint64_t offset = nvme_moff(ns, slba);
2313 BlockBackend *blk = ns->blkconf.blk;
2314
2315 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2316
2317 if (ret) {
2318 goto out;
2319 }
2320
2321 ctx->mdata.bounce = g_malloc(mlen);
2322
2323 qemu_iovec_reset(&ctx->mdata.iov);
2324 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2325
2326 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2327 nvme_verify_cb, ctx);
2328 return;
2329
2330 out:
2331 nvme_verify_cb(ctx, ret);
2332 }
2333
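/*
 * Bounce buffers used by the Compare command: the data (and, if present,
 * metadata) is first read from the backing device into these buffers and then
 * compared against the data transferred from the host in
 * nvme_compare_data_cb() and nvme_compare_mdata_cb().
 */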
2334 struct nvme_compare_ctx {
2335 struct {
2336 QEMUIOVector iov;
2337 uint8_t *bounce;
2338 } data;
2339
2340 struct {
2341 QEMUIOVector iov;
2342 uint8_t *bounce;
2343 } mdata;
2344 };
2345
2346 static void nvme_compare_mdata_cb(void *opaque, int ret)
2347 {
2348 NvmeRequest *req = opaque;
2349 NvmeNamespace *ns = req->ns;
2350 NvmeCtrl *n = nvme_ctrl(req);
2351 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2352 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2353 uint16_t apptag = le16_to_cpu(rw->apptag);
2354 uint16_t appmask = le16_to_cpu(rw->appmask);
2355 uint64_t reftag = le32_to_cpu(rw->reftag);
2356 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2357 struct nvme_compare_ctx *ctx = req->opaque;
2358 g_autofree uint8_t *buf = NULL;
2359 BlockBackend *blk = ns->blkconf.blk;
2360 BlockAcctCookie *acct = &req->acct;
2361 BlockAcctStats *stats = blk_get_stats(blk);
2362 uint16_t status = NVME_SUCCESS;
2363
2364 reftag |= cdw3 << 32;
2365
2366 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2367
2368 if (ret) {
2369 block_acct_failed(stats, acct);
2370 nvme_aio_err(req, ret);
2371 goto out;
2372 }
2373
2374 buf = g_malloc(ctx->mdata.iov.size);
2375
2376 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2377 NVME_TX_DIRECTION_TO_DEVICE, req);
2378 if (status) {
2379 req->status = status;
2380 goto out;
2381 }
2382
2383 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2384 uint64_t slba = le64_to_cpu(rw->slba);
2385 uint8_t *bufp;
2386 uint8_t *mbufp = ctx->mdata.bounce;
2387 uint8_t *end = mbufp + ctx->mdata.iov.size;
2388 int16_t pil = 0;
2389
2390 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2391 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2392 slba, apptag, appmask, &reftag);
2393 if (status) {
2394 req->status = status;
2395 goto out;
2396 }
2397
2398 /*
2399 * When formatted with protection information, do not compare the DIF
2400 * tuple.
2401 */
2402 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2403 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2404 }
2405
2406 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2407 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2408 req->status = NVME_CMP_FAILURE | NVME_DNR;
2409 goto out;
2410 }
2411 }
2412
2413 goto out;
2414 }
2415
2416 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2417 req->status = NVME_CMP_FAILURE | NVME_DNR;
2418 goto out;
2419 }
2420
2421 block_acct_done(stats, acct);
2422
2423 out:
2424 qemu_iovec_destroy(&ctx->data.iov);
2425 g_free(ctx->data.bounce);
2426
2427 qemu_iovec_destroy(&ctx->mdata.iov);
2428 g_free(ctx->mdata.bounce);
2429
2430 g_free(ctx);
2431
2432 nvme_enqueue_req_completion(nvme_cq(req), req);
2433 }
2434
2435 static void nvme_compare_data_cb(void *opaque, int ret)
2436 {
2437 NvmeRequest *req = opaque;
2438 NvmeCtrl *n = nvme_ctrl(req);
2439 NvmeNamespace *ns = req->ns;
2440 BlockBackend *blk = ns->blkconf.blk;
2441 BlockAcctCookie *acct = &req->acct;
2442 BlockAcctStats *stats = blk_get_stats(blk);
2443
2444 struct nvme_compare_ctx *ctx = req->opaque;
2445 g_autofree uint8_t *buf = NULL;
2446 uint16_t status;
2447
2448 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2449
2450 if (ret) {
2451 block_acct_failed(stats, acct);
2452 nvme_aio_err(req, ret);
2453 goto out;
2454 }
2455
2456 buf = g_malloc(ctx->data.iov.size);
2457
2458 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2459 NVME_TX_DIRECTION_TO_DEVICE, req);
2460 if (status) {
2461 req->status = status;
2462 goto out;
2463 }
2464
2465 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2466 req->status = NVME_CMP_FAILURE | NVME_DNR;
2467 goto out;
2468 }
2469
2470 if (ns->lbaf.ms) {
2471 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2472 uint64_t slba = le64_to_cpu(rw->slba);
2473 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2474 size_t mlen = nvme_m2b(ns, nlb);
2475 uint64_t offset = nvme_moff(ns, slba);
2476
2477 ctx->mdata.bounce = g_malloc(mlen);
2478
2479 qemu_iovec_init(&ctx->mdata.iov, 1);
2480 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2481
2482 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2483 nvme_compare_mdata_cb, req);
2484 return;
2485 }
2486
2487 block_acct_done(stats, acct);
2488
2489 out:
2490 qemu_iovec_destroy(&ctx->data.iov);
2491 g_free(ctx->data.bounce);
2492 g_free(ctx);
2493
2494 nvme_enqueue_req_completion(nvme_cq(req), req);
2495 }
2496
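/*
 * State for the Dataset Management (Deallocate) AIO chain. The ranges are
 * processed one at a time by nvme_dsm_cb(); after each discard,
 * nvme_dsm_md_cb() zeroes the corresponding metadata, but only if the
 * discarded blocks actually read back as zeroes.
 */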
2497 typedef struct NvmeDSMAIOCB {
2498 BlockAIOCB common;
2499 BlockAIOCB *aiocb;
2500 NvmeRequest *req;
2501 int ret;
2502
2503 NvmeDsmRange *range;
2504 unsigned int nr;
2505 unsigned int idx;
2506 } NvmeDSMAIOCB;
2507
2508 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2509 {
2510 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2511
2512 /* break nvme_dsm_cb loop */
2513 iocb->idx = iocb->nr;
2514 iocb->ret = -ECANCELED;
2515
2516 if (iocb->aiocb) {
2517 blk_aio_cancel_async(iocb->aiocb);
2518 iocb->aiocb = NULL;
2519 } else {
2520 /*
2521 * We only reach this if nvme_dsm_cancel() has already been called or
2522 * the command ran to completion.
2523 */
2524 assert(iocb->idx == iocb->nr);
2525 }
2526 }
2527
2528 static const AIOCBInfo nvme_dsm_aiocb_info = {
2529 .aiocb_size = sizeof(NvmeDSMAIOCB),
2530 .cancel_async = nvme_dsm_cancel,
2531 };
2532
2533 static void nvme_dsm_cb(void *opaque, int ret);
2534
2535 static void nvme_dsm_md_cb(void *opaque, int ret)
2536 {
2537 NvmeDSMAIOCB *iocb = opaque;
2538 NvmeRequest *req = iocb->req;
2539 NvmeNamespace *ns = req->ns;
2540 NvmeDsmRange *range;
2541 uint64_t slba;
2542 uint32_t nlb;
2543
2544 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2545 goto done;
2546 }
2547
2548 range = &iocb->range[iocb->idx - 1];
2549 slba = le64_to_cpu(range->slba);
2550 nlb = le32_to_cpu(range->nlb);
2551
2552 /*
2553 * Check that all blocks were discarded (zeroed); otherwise we do not zero
2554 * the metadata.
2555 */
2556
2557 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2558 if (ret) {
2559 if (ret < 0) {
2560 goto done;
2561 }
2562
2563 nvme_dsm_cb(iocb, 0);
2564 return;
2565 }
2566
2567 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2568 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2569 nvme_dsm_cb, iocb);
2570 return;
2571
2572 done:
2573 nvme_dsm_cb(iocb, ret);
2574 }
2575
2576 static void nvme_dsm_cb(void *opaque, int ret)
2577 {
2578 NvmeDSMAIOCB *iocb = opaque;
2579 NvmeRequest *req = iocb->req;
2580 NvmeCtrl *n = nvme_ctrl(req);
2581 NvmeNamespace *ns = req->ns;
2582 NvmeDsmRange *range;
2583 uint64_t slba;
2584 uint32_t nlb;
2585
2586 if (iocb->ret < 0) {
2587 goto done;
2588 } else if (ret < 0) {
2589 iocb->ret = ret;
2590 goto done;
2591 }
2592
2593 next:
2594 if (iocb->idx == iocb->nr) {
2595 goto done;
2596 }
2597
2598 range = &iocb->range[iocb->idx++];
2599 slba = le64_to_cpu(range->slba);
2600 nlb = le32_to_cpu(range->nlb);
2601
2602 trace_pci_nvme_dsm_deallocate(slba, nlb);
2603
2604 if (nlb > n->dmrsl) {
2605 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2606 goto next;
2607 }
2608
2609 if (nvme_check_bounds(ns, slba, nlb)) {
2610 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2611 ns->id_ns.nsze);
2612 goto next;
2613 }
2614
2615 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2616 nvme_l2b(ns, nlb),
2617 nvme_dsm_md_cb, iocb);
2618 return;
2619
2620 done:
2621 iocb->aiocb = NULL;
2622 iocb->common.cb(iocb->common.opaque, iocb->ret);
2623 g_free(iocb->range);
2624 qemu_aio_unref(iocb);
2625 }
2626
2627 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2628 {
2629 NvmeNamespace *ns = req->ns;
2630 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2631 uint32_t attr = le32_to_cpu(dsm->attributes);
2632 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2633 uint16_t status = NVME_SUCCESS;
2634
2635 trace_pci_nvme_dsm(nr, attr);
2636
2637 if (attr & NVME_DSMGMT_AD) {
2638 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2639 nvme_misc_cb, req);
2640
2641 iocb->req = req;
2642 iocb->ret = 0;
2643 iocb->range = g_new(NvmeDsmRange, nr);
2644 iocb->nr = nr;
2645 iocb->idx = 0;
2646
2647 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2648 req);
2649 if (status) {
2650 g_free(iocb->range);
2651 qemu_aio_unref(iocb);
2652
2653 return status;
2654 }
2655
2656 req->aiocb = &iocb->common;
2657 nvme_dsm_cb(iocb, 0);
2658
2659 return NVME_NO_COMPLETE;
2660 }
2661
2662 return status;
2663 }
2664
2665 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2666 {
2667 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2668 NvmeNamespace *ns = req->ns;
2669 BlockBackend *blk = ns->blkconf.blk;
2670 uint64_t slba = le64_to_cpu(rw->slba);
2671 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2672 size_t len = nvme_l2b(ns, nlb);
2673 size_t data_len = len;
2674 int64_t offset = nvme_l2b(ns, slba);
2675 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2676 uint32_t reftag = le32_to_cpu(rw->reftag);
2677 NvmeBounceContext *ctx = NULL;
2678 uint16_t status;
2679
2680 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2681
2682 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2683 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2684 if (status) {
2685 return status;
2686 }
2687
2688 if (prinfo & NVME_PRINFO_PRACT) {
2689 return NVME_INVALID_PROT_INFO | NVME_DNR;
2690 }
2691 }
2692
2693 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
2694 data_len += nvme_m2b(ns, nlb);
2695 }
2696
2697 if (data_len > (n->page_size << n->params.vsl)) {
2698 return NVME_INVALID_FIELD | NVME_DNR;
2699 }
2700
2701 status = nvme_check_bounds(ns, slba, nlb);
2702 if (status) {
2703 return status;
2704 }
2705
2706 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2707 status = nvme_check_dulbe(ns, slba, nlb);
2708 if (status) {
2709 return status;
2710 }
2711 }
2712
2713 ctx = g_new0(NvmeBounceContext, 1);
2714 ctx->req = req;
2715
2716 ctx->data.bounce = g_malloc(len);
2717
2718 qemu_iovec_init(&ctx->data.iov, 1);
2719 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2720
2721 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2722 BLOCK_ACCT_READ);
2723
2724 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2725 nvme_verify_mdata_in_cb, ctx);
2726 return NVME_NO_COMPLETE;
2727 }
2728
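/*
 * State for the Copy command AIO chain. Each source range is read into the
 * bounce buffer, checked (and, when required, re-generated) for protection
 * information, and then written out at the current destination SLBA; idx
 * tracks the source range being processed and slba the destination write
 * position.
 */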
2729 typedef struct NvmeCopyAIOCB {
2730 BlockAIOCB common;
2731 BlockAIOCB *aiocb;
2732 NvmeRequest *req;
2733 NvmeCtrl *n;
2734 int ret;
2735
2736 void *ranges;
2737 unsigned int format;
2738 int nr;
2739 int idx;
2740
2741 uint8_t *bounce;
2742 QEMUIOVector iov;
2743 struct {
2744 BlockAcctCookie read;
2745 BlockAcctCookie write;
2746 } acct;
2747
2748 uint64_t reftag;
2749 uint64_t slba;
2750
2751 NvmeZone *zone;
2752 NvmeNamespace *sns;
2753 uint32_t tcl;
2754 } NvmeCopyAIOCB;
2755
2756 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2757 {
2758 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2759
2760 iocb->ret = -ECANCELED;
2761
2762 if (iocb->aiocb) {
2763 blk_aio_cancel_async(iocb->aiocb);
2764 iocb->aiocb = NULL;
2765 }
2766 }
2767
2768 static const AIOCBInfo nvme_copy_aiocb_info = {
2769 .aiocb_size = sizeof(NvmeCopyAIOCB),
2770 .cancel_async = nvme_copy_cancel,
2771 };
2772
2773 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2774 {
2775 NvmeRequest *req = iocb->req;
2776 NvmeNamespace *ns = req->ns;
2777 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2778
2779 if (iocb->idx != iocb->nr) {
2780 req->cqe.result = cpu_to_le32(iocb->idx);
2781 }
2782
2783 qemu_iovec_destroy(&iocb->iov);
2784 g_free(iocb->bounce);
2785
2786 if (iocb->ret < 0) {
2787 block_acct_failed(stats, &iocb->acct.read);
2788 block_acct_failed(stats, &iocb->acct.write);
2789 } else {
2790 block_acct_done(stats, &iocb->acct.read);
2791 block_acct_done(stats, &iocb->acct.write);
2792 }
2793
2794 iocb->common.cb(iocb->common.opaque, iocb->ret);
2795 qemu_aio_unref(iocb);
2796 }
2797
2798 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2799
2800 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2801 int idx, uint64_t *slba,
2802 uint32_t *nlb,
2803 uint32_t *snsid,
2804 uint16_t *apptag,
2805 uint16_t *appmask,
2806 uint64_t *reftag)
2807 {
2808 NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2809
2810 if (snsid) {
2811 *snsid = le32_to_cpu(_ranges[idx].sparams);
2812 }
2813
2814 if (slba) {
2815 *slba = le64_to_cpu(_ranges[idx].slba);
2816 }
2817
2818 if (nlb) {
2819 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2820 }
2821
2822 if (apptag) {
2823 *apptag = le16_to_cpu(_ranges[idx].apptag);
2824 }
2825
2826 if (appmask) {
2827 *appmask = le16_to_cpu(_ranges[idx].appmask);
2828 }
2829
2830 if (reftag) {
2831 *reftag = le32_to_cpu(_ranges[idx].reftag);
2832 }
2833 }
2834
2835 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2836 uint64_t *slba,
2837 uint32_t *nlb,
2838 uint32_t *snsid,
2839 uint16_t *apptag,
2840 uint16_t *appmask,
2841 uint64_t *reftag)
2842 {
2843 NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2844
2845 if (snsid) {
2846 *snsid = le32_to_cpu(_ranges[idx].sparams);
2847 }
2848
2849 if (slba) {
2850 *slba = le64_to_cpu(_ranges[idx].slba);
2851 }
2852
2853 if (nlb) {
2854 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2855 }
2856
2857 if (apptag) {
2858 *apptag = le16_to_cpu(_ranges[idx].apptag);
2859 }
2860
2861 if (appmask) {
2862 *appmask = le16_to_cpu(_ranges[idx].appmask);
2863 }
2864
2865 if (reftag) {
2866 *reftag = 0;
2867
2868 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2869 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2870 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2871 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2872 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2873 *reftag |= (uint64_t)_ranges[idx].sr[9];
2874 }
2875 }
2876
2877 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2878 uint64_t *slba, uint32_t *nlb,
2879 uint32_t *snsid, uint16_t *apptag,
2880 uint16_t *appmask, uint64_t *reftag)
2881 {
2882 switch (format) {
2883 case NVME_COPY_FORMAT_0:
2884 case NVME_COPY_FORMAT_2:
2885 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2886 apptag, appmask, reftag);
2887 break;
2888
2889 case NVME_COPY_FORMAT_1:
2890 case NVME_COPY_FORMAT_3:
2891 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2892 apptag, appmask, reftag);
2893 break;
2894
2895 default:
2896 abort();
2897 }
2898 }
2899
2900 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2901 NvmeCopyAIOCB *iocb, uint16_t nr)
2902 {
2903 uint32_t copy_len = 0;
2904
2905 for (int idx = 0; idx < nr; idx++) {
2906 uint32_t nlb;
2907 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2908 &nlb, NULL, NULL, NULL, NULL);
2909 copy_len += nlb;
2910 }
2911 iocb->tcl = copy_len;
2912 if (copy_len > ns->id_ns.mcl) {
2913 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2914 }
2915
2916 return NVME_SUCCESS;
2917 }
2918
2919 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2920 {
2921 NvmeCopyAIOCB *iocb = opaque;
2922 NvmeRequest *req = iocb->req;
2923 NvmeNamespace *dns = req->ns;
2924 uint32_t nlb;
2925
2926 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2927 &nlb, NULL, NULL, NULL, NULL);
2928
2929 if (ret < 0) {
2930 iocb->ret = ret;
2931 goto out;
2932 } else if (iocb->ret < 0) {
2933 goto out;
2934 }
2935
2936 if (dns->params.zoned) {
2937 nvme_advance_zone_wp(dns, iocb->zone, nlb);
2938 }
2939
2940 iocb->idx++;
2941 iocb->slba += nlb;
2942 out:
2943 nvme_do_copy(iocb);
2944 }
2945
2946 static void nvme_copy_out_cb(void *opaque, int ret)
2947 {
2948 NvmeCopyAIOCB *iocb = opaque;
2949 NvmeRequest *req = iocb->req;
2950 NvmeNamespace *dns = req->ns;
2951 uint32_t nlb;
2952 size_t mlen;
2953 uint8_t *mbounce;
2954
2955 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2956 goto out;
2957 }
2958
2959 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2960 &nlb, NULL, NULL, NULL, NULL);
2961
2962 mlen = nvme_m2b(dns, nlb);
2963 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2964
2965 qemu_iovec_reset(&iocb->iov);
2966 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2967
2968 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2969 &iocb->iov, 0, nvme_copy_out_completed_cb,
2970 iocb);
2971
2972 return;
2973
2974 out:
2975 nvme_copy_out_completed_cb(iocb, ret);
2976 }
2977
2978 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2979 {
2980 NvmeCopyAIOCB *iocb = opaque;
2981 NvmeRequest *req = iocb->req;
2982 NvmeNamespace *sns = iocb->sns;
2983 NvmeNamespace *dns = req->ns;
2984 NvmeCopyCmd *copy = NULL;
2985 uint8_t *mbounce = NULL;
2986 uint32_t nlb;
2987 uint64_t slba;
2988 uint16_t apptag, appmask;
2989 uint64_t reftag;
2990 size_t len, mlen;
2991 uint16_t status;
2992
2993 if (ret < 0) {
2994 iocb->ret = ret;
2995 goto out;
2996 } else if (iocb->ret < 0) {
2997 goto out;
2998 }
2999
3000 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3001 &nlb, NULL, &apptag, &appmask, &reftag);
3002
3003 trace_pci_nvme_copy_out(iocb->slba, nlb);
3004
3005 len = nvme_l2b(sns, nlb);
3006
3007 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
3008 copy = (NvmeCopyCmd *)&req->cmd;
3009
3010 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3011
3012 mlen = nvme_m2b(sns, nlb);
3013 mbounce = iocb->bounce + nvme_l2b(sns, nlb);
3014
3015 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
3016 if (status) {
3017 goto invalid;
3018 }
3019 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
3020 slba, apptag, appmask, &reftag);
3021 if (status) {
3022 goto invalid;
3023 }
3024 }
3025
3026 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3027 copy = (NvmeCopyCmd *)&req->cmd;
3028 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3029
3030 mlen = nvme_m2b(dns, nlb);
3031 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3032
3033 apptag = le16_to_cpu(copy->apptag);
3034 appmask = le16_to_cpu(copy->appmask);
3035
3036 if (prinfow & NVME_PRINFO_PRACT) {
3037 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3038 if (status) {
3039 goto invalid;
3040 }
3041
3042 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3043 apptag, &iocb->reftag);
3044 } else {
3045 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3046 prinfow, iocb->slba, apptag, appmask,
3047 &iocb->reftag);
3048 if (status) {
3049 goto invalid;
3050 }
3051 }
3052 }
3053
3054 status = nvme_check_bounds(dns, iocb->slba, nlb);
3055 if (status) {
3056 goto invalid;
3057 }
3058
3059 if (dns->params.zoned) {
3060 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3061 if (status) {
3062 goto invalid;
3063 }
3064
3065 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3066 iocb->zone->w_ptr += nlb;
3067 }
3068 }
3069
3070 qemu_iovec_reset(&iocb->iov);
3071 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3072
3073 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3074 BLOCK_ACCT_WRITE);
3075
3076 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3077 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3078
3079 return;
3080
3081 invalid:
3082 req->status = status;
3083 iocb->ret = -1;
3084 out:
3085 nvme_do_copy(iocb);
3086 }
3087
3088 static void nvme_copy_in_cb(void *opaque, int ret)
3089 {
3090 NvmeCopyAIOCB *iocb = opaque;
3091 NvmeNamespace *sns = iocb->sns;
3092 uint64_t slba;
3093 uint32_t nlb;
3094
3095 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3096 goto out;
3097 }
3098
3099 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3100 &nlb, NULL, NULL, NULL, NULL);
3101
3102 qemu_iovec_reset(&iocb->iov);
3103 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3104 nvme_m2b(sns, nlb));
3105
3106 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3107 &iocb->iov, 0, nvme_copy_in_completed_cb,
3108 iocb);
3109 return;
3110
3111 out:
3112 nvme_copy_in_completed_cb(iocb, ret);
3113 }
3114
3115 static inline bool nvme_csi_supports_copy(uint8_t csi)
3116 {
3117 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3118 }
3119
3120 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3121 NvmeNamespace *dns)
3122 {
3123 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3124 }
3125
3126 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3127 bool pi_enable)
3128 {
3129 if (!nvme_csi_supports_copy(sns->csi) ||
3130 !nvme_csi_supports_copy(dns->csi)) {
3131 return false;
3132 }
3133
3134 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3135 return false;
3136 }
3137
3138 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3139 sns->id_ns.dps != dns->id_ns.dps)) {
3140 return false;
3141 }
3142
3143 return true;
3144 }
3145
3146 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3147 NvmeNamespace *dns)
3148 {
3149 return sns->lbaf.ms == 0 &&
3150 ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3151 (dns->lbaf.ms == 16 && dns->pif == 1));
3152 }
3153
3154 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3155 bool sns_pi_en)
3156 {
3157 if (!nvme_csi_supports_copy(sns->csi) ||
3158 !nvme_csi_supports_copy(dns->csi)) {
3159 return false;
3160 }
3161
3162 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3163 return false;
3164 }
3165
3166 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3167 return false;
3168 }
3169
3170 return true;
3171 }
3172
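/*
 * Main loop of the Copy command. Parse the next source range (formats 2 and 3
 * additionally carry the source namespace id), validate that source and
 * destination namespace formats and protection settings are compatible, check
 * bounds, DULBE and zone state on the source, and start the read into the
 * bounce buffer. The chained callbacks above continue with the write to the
 * destination until all ranges are consumed or an error is recorded.
 */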
3173 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3174 {
3175 NvmeRequest *req = iocb->req;
3176 NvmeNamespace *sns;
3177 NvmeNamespace *dns = req->ns;
3178 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3179 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3180 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3181 uint64_t slba;
3182 uint32_t nlb;
3183 size_t len;
3184 uint16_t status;
3185 uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3186 uint32_t snsid = dnsid;
3187
3188 if (iocb->ret < 0) {
3189 goto done;
3190 }
3191
3192 if (iocb->idx == iocb->nr) {
3193 goto done;
3194 }
3195
3196 if (iocb->format == 2 || iocb->format == 3) {
3197 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3198 &slba, &nlb, &snsid, NULL, NULL, NULL);
3199 if (snsid != dnsid) {
3200 if (snsid == NVME_NSID_BROADCAST ||
3201 !nvme_nsid_valid(iocb->n, snsid)) {
3202 status = NVME_INVALID_NSID | NVME_DNR;
3203 goto invalid;
3204 }
3205 iocb->sns = nvme_ns(iocb->n, snsid);
3206 if (unlikely(!iocb->sns)) {
3207 status = NVME_INVALID_FIELD | NVME_DNR;
3208 goto invalid;
3209 }
3210 } else {
3211 if (((slba + nlb) > iocb->slba) &&
3212 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3213 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3214 goto invalid;
3215 }
3216 }
3217 } else {
3218 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3219 &slba, &nlb, NULL, NULL, NULL, NULL);
3220 }
3221
3222 sns = iocb->sns;
3223 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3224 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3225 status = NVME_INVALID_FIELD | NVME_DNR;
3226 goto invalid;
3227 } else if (snsid != dnsid) {
3228 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3229 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3230 if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3231 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3232 goto invalid;
3233 }
3234 }
3235 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3236 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3237 if ((prinfor & NVME_PRINFO_PRACT) !=
3238 (prinfow & NVME_PRINFO_PRACT)) {
3239 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3240 goto invalid;
3241 } else {
3242 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3243 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3244 goto invalid;
3245 }
3246 }
3247 }
3248
3249 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3250 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3251 if (!(prinfow & NVME_PRINFO_PRACT)) {
3252 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3253 goto invalid;
3254 } else {
3255 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3256 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3257 goto invalid;
3258 }
3259 }
3260 }
3261
3262 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3263 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3264 if (!(prinfor & NVME_PRINFO_PRACT)) {
3265 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3266 goto invalid;
3267 } else {
3268 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3269 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3270 goto invalid;
3271 }
3272 }
3273 }
3274 }
3275 len = nvme_l2b(sns, nlb);
3276
3277 trace_pci_nvme_copy_source_range(slba, nlb);
3278
3279 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3280 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3281 goto invalid;
3282 }
3283
3284 status = nvme_check_bounds(sns, slba, nlb);
3285 if (status) {
3286 goto invalid;
3287 }
3288
3289 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3290 status = nvme_check_dulbe(sns, slba, nlb);
3291 if (status) {
3292 goto invalid;
3293 }
3294 }
3295
3296 if (sns->params.zoned) {
3297 status = nvme_check_zone_read(sns, slba, nlb);
3298 if (status) {
3299 goto invalid;
3300 }
3301 }
3302
3303 g_free(iocb->bounce);
3304 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3305 sns->lbasz + sns->lbaf.ms);
3306
3307 qemu_iovec_reset(&iocb->iov);
3308 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3309
3310 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3311 BLOCK_ACCT_READ);
3312
3313 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3314 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3315 return;
3316
3317 invalid:
3318 req->status = status;
3319 iocb->ret = -1;
3320 done:
3321 nvme_copy_done(iocb);
3322 }
3323
3324 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3325 {
3326 NvmeNamespace *ns = req->ns;
3327 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3328 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3329 nvme_misc_cb, req);
3330 uint16_t nr = copy->nr + 1;
3331 uint8_t format = copy->control[0] & 0xf;
3332 size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3333
3334 uint16_t status;
3335
3336 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3337
3338 iocb->ranges = NULL;
3339 iocb->zone = NULL;
3340
3341 if (!(n->id_ctrl.ocfs & (1 << format)) ||
3342 ((format == 2 || format == 3) &&
3343 !(n->features.hbs.cdfe & (1 << format)))) {
3344 trace_pci_nvme_err_copy_invalid_format(format);
3345 status = NVME_INVALID_FIELD | NVME_DNR;
3346 goto invalid;
3347 }
3348
3349 if (nr > ns->id_ns.msrc + 1) {
3350 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3351 goto invalid;
3352 }
3353
3354 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3355 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3356 status = NVME_INVALID_FORMAT | NVME_DNR;
3357 goto invalid;
3358 }
3359
3360 if (ns->pif) {
3361 len = sizeof(NvmeCopySourceRangeFormat1_3);
3362 }
3363
3364 iocb->format = format;
3365 iocb->ranges = g_malloc_n(nr, len);
3366 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3367 if (status) {
3368 goto invalid;
3369 }
3370
3371 iocb->slba = le64_to_cpu(copy->sdlba);
3372
3373 if (ns->params.zoned) {
3374 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3375 if (!iocb->zone) {
3376 status = NVME_LBA_RANGE | NVME_DNR;
3377 goto invalid;
3378 }
3379
3380 status = nvme_zrm_auto(n, ns, iocb->zone);
3381 if (status) {
3382 goto invalid;
3383 }
3384 }
3385
3386 status = nvme_check_copy_mcl(ns, iocb, nr);
3387 if (status) {
3388 goto invalid;
3389 }
3390
3391 iocb->req = req;
3392 iocb->ret = 0;
3393 iocb->nr = nr;
3394 iocb->idx = 0;
3395 iocb->reftag = le32_to_cpu(copy->reftag);
3396 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3397
3398 qemu_iovec_init(&iocb->iov, 1);
3399
3400 req->aiocb = &iocb->common;
3401 iocb->sns = req->ns;
3402 iocb->n = n;
3403 iocb->bounce = NULL;
3404 nvme_do_copy(iocb);
3405
3406 return NVME_NO_COMPLETE;
3407
3408 invalid:
3409 g_free(iocb->ranges);
3410 qemu_aio_unref(iocb);
3411 return status;
3412 }
3413
3414 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3415 {
3416 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3417 NvmeNamespace *ns = req->ns;
3418 BlockBackend *blk = ns->blkconf.blk;
3419 uint64_t slba = le64_to_cpu(rw->slba);
3420 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3421 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3422 size_t data_len = nvme_l2b(ns, nlb);
3423 size_t len = data_len;
3424 int64_t offset = nvme_l2b(ns, slba);
3425 struct nvme_compare_ctx *ctx = NULL;
3426 uint16_t status;
3427
3428 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3429
3430 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3431 return NVME_INVALID_PROT_INFO | NVME_DNR;
3432 }
3433
3434 if (nvme_ns_ext(ns)) {
3435 len += nvme_m2b(ns, nlb);
3436 }
3437
3438 if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) {
3439 status = nvme_check_mdts(n, data_len);
3440 } else {
3441 status = nvme_check_mdts(n, len);
3442 }
3443 if (status) {
3444 return status;
3445 }
3446
3447 status = nvme_check_bounds(ns, slba, nlb);
3448 if (status) {
3449 return status;
3450 }
3451
3452 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3453 status = nvme_check_dulbe(ns, slba, nlb);
3454 if (status) {
3455 return status;
3456 }
3457 }
3458
3459 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3460 if (status) {
3461 return status;
3462 }
3463
3464 ctx = g_new(struct nvme_compare_ctx, 1);
3465 ctx->data.bounce = g_malloc(data_len);
3466
3467 req->opaque = ctx;
3468
3469 qemu_iovec_init(&ctx->data.iov, 1);
3470 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3471
3472 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3473 BLOCK_ACCT_READ);
3474 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3475 nvme_compare_data_cb, req);
3476
3477 return NVME_NO_COMPLETE;
3478 }
3479
3480 typedef struct NvmeFlushAIOCB {
3481 BlockAIOCB common;
3482 BlockAIOCB *aiocb;
3483 NvmeRequest *req;
3484 int ret;
3485
3486 NvmeNamespace *ns;
3487 uint32_t nsid;
3488 bool broadcast;
3489 } NvmeFlushAIOCB;
3490
3491 static void nvme_flush_cancel(BlockAIOCB *acb)
3492 {
3493 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3494
3495 iocb->ret = -ECANCELED;
3496
3497 if (iocb->aiocb) {
3498 blk_aio_cancel_async(iocb->aiocb);
3499 iocb->aiocb = NULL;
3500 }
3501 }
3502
3503 static const AIOCBInfo nvme_flush_aiocb_info = {
3504 .aiocb_size = sizeof(NvmeFlushAIOCB),
3505 .cancel_async = nvme_flush_cancel,
3506 };
3507
3508 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3509
3510 static void nvme_flush_ns_cb(void *opaque, int ret)
3511 {
3512 NvmeFlushAIOCB *iocb = opaque;
3513 NvmeNamespace *ns = iocb->ns;
3514
3515 if (ret < 0) {
3516 iocb->ret = ret;
3517 goto out;
3518 } else if (iocb->ret < 0) {
3519 goto out;
3520 }
3521
3522 if (ns) {
3523 trace_pci_nvme_flush_ns(iocb->nsid);
3524
3525 iocb->ns = NULL;
3526 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3527 return;
3528 }
3529
3530 out:
3531 nvme_do_flush(iocb);
3532 }
3533
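/*
 * Issue the next flush. For a broadcast flush (NSID FFFFFFFFh) this walks the
 * namespaces in ascending order, flushing one at a time through
 * nvme_flush_ns_cb(); the request completes once no further namespace is
 * found or an error has been recorded.
 */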
3534 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3535 {
3536 NvmeRequest *req = iocb->req;
3537 NvmeCtrl *n = nvme_ctrl(req);
3538 int i;
3539
3540 if (iocb->ret < 0) {
3541 goto done;
3542 }
3543
3544 if (iocb->broadcast) {
3545 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3546 iocb->ns = nvme_ns(n, i);
3547 if (iocb->ns) {
3548 iocb->nsid = i;
3549 break;
3550 }
3551 }
3552 }
3553
3554 if (!iocb->ns) {
3555 goto done;
3556 }
3557
3558 nvme_flush_ns_cb(iocb, 0);
3559 return;
3560
3561 done:
3562 iocb->common.cb(iocb->common.opaque, iocb->ret);
3563 qemu_aio_unref(iocb);
3564 }
3565
3566 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3567 {
3568 NvmeFlushAIOCB *iocb;
3569 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3570 uint16_t status;
3571
3572 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3573
3574 iocb->req = req;
3575 iocb->ret = 0;
3576 iocb->ns = NULL;
3577 iocb->nsid = 0;
3578 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3579
3580 if (!iocb->broadcast) {
3581 if (!nvme_nsid_valid(n, nsid)) {
3582 status = NVME_INVALID_NSID | NVME_DNR;
3583 goto out;
3584 }
3585
3586 iocb->ns = nvme_ns(n, nsid);
3587 if (!iocb->ns) {
3588 status = NVME_INVALID_FIELD | NVME_DNR;
3589 goto out;
3590 }
3591
3592 iocb->nsid = nsid;
3593 }
3594
3595 req->aiocb = &iocb->common;
3596 nvme_do_flush(iocb);
3597
3598 return NVME_NO_COMPLETE;
3599
3600 out:
3601 qemu_aio_unref(iocb);
3602
3603 return status;
3604 }
3605
3606 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3607 {
3608 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3609 NvmeNamespace *ns = req->ns;
3610 uint64_t slba = le64_to_cpu(rw->slba);
3611 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3612 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3613 uint64_t data_size = nvme_l2b(ns, nlb);
3614 uint64_t mapped_size = data_size;
3615 uint64_t data_offset;
3616 BlockBackend *blk = ns->blkconf.blk;
3617 uint16_t status;
3618
3619 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3620 mapped_size += nvme_m2b(ns, nlb);
3621
3622 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3623 bool pract = prinfo & NVME_PRINFO_PRACT;
3624
3625 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3626 mapped_size = data_size;
3627 }
3628 }
3629 }
3630
3631 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3632
3633 status = nvme_check_mdts(n, mapped_size);
3634 if (status) {
3635 goto invalid;
3636 }
3637
3638 status = nvme_check_bounds(ns, slba, nlb);
3639 if (status) {
3640 goto invalid;
3641 }
3642
3643 if (ns->params.zoned) {
3644 status = nvme_check_zone_read(ns, slba, nlb);
3645 if (status) {
3646 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3647 goto invalid;
3648 }
3649 }
3650
3651 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3652 status = nvme_check_dulbe(ns, slba, nlb);
3653 if (status) {
3654 goto invalid;
3655 }
3656 }
3657
3658 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3659 return nvme_dif_rw(n, req);
3660 }
3661
3662 status = nvme_map_data(n, nlb, req);
3663 if (status) {
3664 goto invalid;
3665 }
3666
3667 data_offset = nvme_l2b(ns, slba);
3668
3669 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3670 BLOCK_ACCT_READ);
3671 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3672 return NVME_NO_COMPLETE;
3673
3674 invalid:
3675 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3676 return status | NVME_DNR;
3677 }
3678
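/*
 * Account a write against Flexible Data Placement statistics and reclaim
 * units. The placement handle is taken from DSPEC when the directive type is
 * Data Placement (otherwise placement handle and reclaim group 0 are used);
 * the written blocks consume capacity from the targeted reclaim unit, and
 * nvme_update_ruh() moves on to a new reclaim unit whenever the current one
 * is filled.
 */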
3679 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3680 uint32_t nlb)
3681 {
3682 NvmeNamespace *ns = req->ns;
3683 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3684 uint64_t data_size = nvme_l2b(ns, nlb);
3685 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3686 uint8_t dtype = (dw12 >> 20) & 0xf;
3687 uint16_t pid = le16_to_cpu(rw->dspec);
3688 uint16_t ph, rg, ruhid;
3689 NvmeReclaimUnit *ru;
3690
3691 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3692 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3693 ph = 0;
3694 rg = 0;
3695 }
3696
3697 ruhid = ns->fdp.phs[ph];
3698 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3699
3700 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3701 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3702
3703 while (nlb) {
3704 if (nlb < ru->ruamw) {
3705 ru->ruamw -= nlb;
3706 break;
3707 }
3708
3709 nlb -= ru->ruamw;
3710 nvme_update_ruh(n, ns, pid);
3711 }
3712 }
3713
3714 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3715 bool wrz)
3716 {
3717 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3718 NvmeNamespace *ns = req->ns;
3719 uint64_t slba = le64_to_cpu(rw->slba);
3720 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3721 uint16_t ctrl = le16_to_cpu(rw->control);
3722 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3723 uint64_t data_size = nvme_l2b(ns, nlb);
3724 uint64_t mapped_size = data_size;
3725 uint64_t data_offset;
3726 NvmeZone *zone;
3727 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3728 BlockBackend *blk = ns->blkconf.blk;
3729 uint16_t status;
3730
3731 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3732 mapped_size += nvme_m2b(ns, nlb);
3733
3734 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3735 bool pract = prinfo & NVME_PRINFO_PRACT;
3736
3737 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3738 mapped_size -= nvme_m2b(ns, nlb);
3739 }
3740 }
3741 }
3742
3743 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3744 nvme_nsid(ns), nlb, mapped_size, slba);
3745
3746 if (!wrz) {
3747 status = nvme_check_mdts(n, mapped_size);
3748 if (status) {
3749 goto invalid;
3750 }
3751 }
3752
3753 status = nvme_check_bounds(ns, slba, nlb);
3754 if (status) {
3755 goto invalid;
3756 }
3757
3758 if (ns->params.zoned) {
3759 zone = nvme_get_zone_by_slba(ns, slba);
3760 assert(zone);
3761
3762 if (append) {
3763 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3764
3765 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3766 return NVME_INVALID_ZONE_OP | NVME_DNR;
3767 }
3768
3769 if (unlikely(slba != zone->d.zslba)) {
3770 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3771 status = NVME_INVALID_FIELD;
3772 goto invalid;
3773 }
3774
3775 if (n->params.zasl &&
3776 data_size > (uint64_t)n->page_size << n->params.zasl) {
3777 trace_pci_nvme_err_zasl(data_size);
3778 return NVME_INVALID_FIELD | NVME_DNR;
3779 }
3780
3781 slba = zone->w_ptr;
3782 rw->slba = cpu_to_le64(slba);
3783 res->slba = cpu_to_le64(slba);
3784
3785 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3786 case NVME_ID_NS_DPS_TYPE_1:
3787 if (!piremap) {
3788 return NVME_INVALID_PROT_INFO | NVME_DNR;
3789 }
3790
3791 /* fallthrough */
3792
3793 case NVME_ID_NS_DPS_TYPE_2:
3794 if (piremap) {
3795 uint32_t reftag = le32_to_cpu(rw->reftag);
3796 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3797 }
3798
3799 break;
3800
3801 case NVME_ID_NS_DPS_TYPE_3:
3802 if (piremap) {
3803 return NVME_INVALID_PROT_INFO | NVME_DNR;
3804 }
3805
3806 break;
3807 }
3808 }
3809
3810 status = nvme_check_zone_write(ns, zone, slba, nlb);
3811 if (status) {
3812 goto invalid;
3813 }
3814
3815 status = nvme_zrm_auto(n, ns, zone);
3816 if (status) {
3817 goto invalid;
3818 }
3819
3820 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3821 zone->w_ptr += nlb;
3822 }
3823 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3824 nvme_do_write_fdp(n, req, slba, nlb);
3825 }
3826
3827 data_offset = nvme_l2b(ns, slba);
3828
3829 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3830 return nvme_dif_rw(n, req);
3831 }
3832
3833 if (!wrz) {
3834 status = nvme_map_data(n, nlb, req);
3835 if (status) {
3836 goto invalid;
3837 }
3838
3839 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3840 BLOCK_ACCT_WRITE);
3841 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3842 } else {
3843 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3844 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3845 req);
3846 }
3847
3848 return NVME_NO_COMPLETE;
3849
3850 invalid:
3851 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3852 return status | NVME_DNR;
3853 }
3854
3855 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3856 {
3857 return nvme_do_write(n, req, false, false);
3858 }
3859
3860 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3861 {
3862 return nvme_do_write(n, req, false, true);
3863 }
3864
3865 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3866 {
3867 return nvme_do_write(n, req, true, false);
3868 }
3869
3870 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3871 uint64_t *slba, uint32_t *zone_idx)
3872 {
3873 uint32_t dw10 = le32_to_cpu(c->cdw10);
3874 uint32_t dw11 = le32_to_cpu(c->cdw11);
3875
3876 if (!ns->params.zoned) {
3877 trace_pci_nvme_err_invalid_opc(c->opcode);
3878 return NVME_INVALID_OPCODE | NVME_DNR;
3879 }
3880
3881 *slba = ((uint64_t)dw11) << 32 | dw10;
3882 if (unlikely(*slba >= ns->id_ns.nsze)) {
3883 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3884 *slba = 0;
3885 return NVME_LBA_RANGE | NVME_DNR;
3886 }
3887
3888 *zone_idx = nvme_zone_idx(ns, *slba);
3889 assert(*zone_idx < ns->num_zones);
3890
3891 return NVME_SUCCESS;
3892 }
3893
3894 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3895 NvmeRequest *);
3896
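/*
 * Zone Management Send processing masks: when no bit is set only the zone
 * addressed by the command is processed; otherwise the operation is applied
 * to every zone whose state matches the mask (see nvme_bulk_proc_zone() and
 * nvme_do_zone_op()).
 */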
3897 enum NvmeZoneProcessingMask {
3898 NVME_PROC_CURRENT_ZONE = 0,
3899 NVME_PROC_OPENED_ZONES = 1 << 0,
3900 NVME_PROC_CLOSED_ZONES = 1 << 1,
3901 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3902 NVME_PROC_FULL_ZONES = 1 << 3,
3903 };
3904
3905 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3906 NvmeZoneState state, NvmeRequest *req)
3907 {
3908 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3909 int flags = 0;
3910
3911 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3912 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3913
3914 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3915 return NVME_INVALID_ZONE_OP | NVME_DNR;
3916 }
3917
3918 if (zone->w_ptr % ns->zns.zrwafg) {
3919 return NVME_NOZRWA | NVME_DNR;
3920 }
3921
3922 flags = NVME_ZRM_ZRWA;
3923 }
3924
3925 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3926 }
3927
3928 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3929 NvmeZoneState state, NvmeRequest *req)
3930 {
3931 return nvme_zrm_close(ns, zone);
3932 }
3933
3934 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3935 NvmeZoneState state, NvmeRequest *req)
3936 {
3937 return nvme_zrm_finish(ns, zone);
3938 }
3939
3940 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3941 NvmeZoneState state, NvmeRequest *req)
3942 {
3943 switch (state) {
3944 case NVME_ZONE_STATE_READ_ONLY:
3945 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3946 /* fall through */
3947 case NVME_ZONE_STATE_OFFLINE:
3948 return NVME_SUCCESS;
3949 default:
3950 return NVME_ZONE_INVAL_TRANSITION;
3951 }
3952 }
3953
3954 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3955 {
3956 uint16_t status;
3957 uint8_t state = nvme_get_zone_state(zone);
3958
3959 if (state == NVME_ZONE_STATE_EMPTY) {
3960 status = nvme_aor_check(ns, 1, 0);
3961 if (status) {
3962 return status;
3963 }
3964 nvme_aor_inc_active(ns);
3965 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3966 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3967 return NVME_SUCCESS;
3968 }
3969
3970 return NVME_ZONE_INVAL_TRANSITION;
3971 }
3972
3973 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3974 enum NvmeZoneProcessingMask proc_mask,
3975 op_handler_t op_hndlr, NvmeRequest *req)
3976 {
3977 uint16_t status = NVME_SUCCESS;
3978 NvmeZoneState zs = nvme_get_zone_state(zone);
3979 bool proc_zone;
3980
3981 switch (zs) {
3982 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3983 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3984 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3985 break;
3986 case NVME_ZONE_STATE_CLOSED:
3987 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3988 break;
3989 case NVME_ZONE_STATE_READ_ONLY:
3990 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3991 break;
3992 case NVME_ZONE_STATE_FULL:
3993 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3994 break;
3995 default:
3996 proc_zone = false;
3997 }
3998
3999 if (proc_zone) {
4000 status = op_hndlr(ns, zone, zs, req);
4001 }
4002
4003 return status;
4004 }
4005
4006 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
4007 enum NvmeZoneProcessingMask proc_mask,
4008 op_handler_t op_hndlr, NvmeRequest *req)
4009 {
4010 NvmeZone *next;
4011 uint16_t status = NVME_SUCCESS;
4012 int i;
4013
4014 if (!proc_mask) {
4015 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
4016 } else {
4017 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
4018 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
4019 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4020 req);
4021 if (status && status != NVME_NO_COMPLETE) {
4022 goto out;
4023 }
4024 }
4025 }
4026 if (proc_mask & NVME_PROC_OPENED_ZONES) {
4027 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4028 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4029 req);
4030 if (status && status != NVME_NO_COMPLETE) {
4031 goto out;
4032 }
4033 }
4034
4035 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4036 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4037 req);
4038 if (status && status != NVME_NO_COMPLETE) {
4039 goto out;
4040 }
4041 }
4042 }
4043 if (proc_mask & NVME_PROC_FULL_ZONES) {
4044 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4045 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4046 req);
4047 if (status && status != NVME_NO_COMPLETE) {
4048 goto out;
4049 }
4050 }
4051 }
4052
4053 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4054 for (i = 0; i < ns->num_zones; i++, zone++) {
4055 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4056 req);
4057 if (status && status != NVME_NO_COMPLETE) {
4058 goto out;
4059 }
4060 }
4061 }
4062 }
4063
4064 out:
4065 return status;
4066 }
4067
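/*
* Zone Reset is handled as a chain of asynchronous operations: for each zone
* to be reset, nvme_zone_reset_cb() issues a pwrite_zeroes over the zone
* data, nvme_zone_reset_epilogue_cb() then zeroes the metadata area (if the
* namespace has one), and control returns to nvme_zone_reset_cb(), which
* resets the zone state and moves on to the next zone when Select All is set.
*/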
4068 typedef struct NvmeZoneResetAIOCB {
4069 BlockAIOCB common;
4070 BlockAIOCB *aiocb;
4071 NvmeRequest *req;
4072 int ret;
4073
4074 bool all;
4075 int idx;
4076 NvmeZone *zone;
4077 } NvmeZoneResetAIOCB;
4078
4079 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4080 {
4081 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4082 NvmeRequest *req = iocb->req;
4083 NvmeNamespace *ns = req->ns;
4084
4085 iocb->idx = ns->num_zones;
4086
4087 iocb->ret = -ECANCELED;
4088
4089 if (iocb->aiocb) {
4090 blk_aio_cancel_async(iocb->aiocb);
4091 iocb->aiocb = NULL;
4092 }
4093 }
4094
4095 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4096 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4097 .cancel_async = nvme_zone_reset_cancel,
4098 };
4099
4100 static void nvme_zone_reset_cb(void *opaque, int ret);
4101
4102 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4103 {
4104 NvmeZoneResetAIOCB *iocb = opaque;
4105 NvmeRequest *req = iocb->req;
4106 NvmeNamespace *ns = req->ns;
4107 int64_t moff;
4108 int count;
4109
4110 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4111 goto out;
4112 }
4113
4114 moff = nvme_moff(ns, iocb->zone->d.zslba);
4115 count = nvme_m2b(ns, ns->zone_size);
4116
4117 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4118 BDRV_REQ_MAY_UNMAP,
4119 nvme_zone_reset_cb, iocb);
4120 return;
4121
4122 out:
4123 nvme_zone_reset_cb(iocb, ret);
4124 }
4125
4126 static void nvme_zone_reset_cb(void *opaque, int ret)
4127 {
4128 NvmeZoneResetAIOCB *iocb = opaque;
4129 NvmeRequest *req = iocb->req;
4130 NvmeNamespace *ns = req->ns;
4131
4132 if (iocb->ret < 0) {
4133 goto done;
4134 } else if (ret < 0) {
4135 iocb->ret = ret;
4136 goto done;
4137 }
4138
4139 if (iocb->zone) {
4140 nvme_zrm_reset(ns, iocb->zone);
4141
4142 if (!iocb->all) {
4143 goto done;
4144 }
4145 }
4146
4147 while (iocb->idx < ns->num_zones) {
4148 NvmeZone *zone = &ns->zone_array[iocb->idx++];
4149
4150 switch (nvme_get_zone_state(zone)) {
4151 case NVME_ZONE_STATE_EMPTY:
4152 if (!iocb->all) {
4153 goto done;
4154 }
4155
4156 continue;
4157
4158 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4159 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4160 case NVME_ZONE_STATE_CLOSED:
4161 case NVME_ZONE_STATE_FULL:
4162 iocb->zone = zone;
4163 break;
4164
4165 default:
4166 continue;
4167 }
4168
4169 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4170
4171 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4172 nvme_l2b(ns, zone->d.zslba),
4173 nvme_l2b(ns, ns->zone_size),
4174 BDRV_REQ_MAY_UNMAP,
4175 nvme_zone_reset_epilogue_cb,
4176 iocb);
4177 return;
4178 }
4179
4180 done:
4181 iocb->aiocb = NULL;
4182
4183 iocb->common.cb(iocb->common.opaque, iocb->ret);
4184 qemu_aio_unref(iocb);
4185 }
4186
4187 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4188 uint64_t elba, NvmeRequest *req)
4189 {
4190 NvmeNamespace *ns = req->ns;
4191 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4192 uint64_t wp = zone->d.wp;
4193 uint32_t nlb = elba - wp + 1;
4194 uint16_t status;
4195
4196
4197 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4198 return NVME_INVALID_ZONE_OP | NVME_DNR;
4199 }
4200
4201 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4202 return NVME_INVALID_FIELD | NVME_DNR;
4203 }
4204
4205 if (elba < wp || elba > wp + ns->zns.zrwas) {
4206 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4207 }
4208
4209 if (nlb % ns->zns.zrwafg) {
4210 return NVME_INVALID_FIELD | NVME_DNR;
4211 }
4212
4213 status = nvme_zrm_auto(n, ns, zone);
4214 if (status) {
4215 return status;
4216 }
4217
4218 zone->w_ptr += nlb;
4219
4220 nvme_advance_zone_wp(ns, zone, nlb);
4221
4222 return NVME_SUCCESS;
4223 }
4224
4225 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4226 {
4227 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4228 NvmeNamespace *ns = req->ns;
4229 NvmeZone *zone;
4230 NvmeZoneResetAIOCB *iocb;
4231 uint8_t *zd_ext;
4232 uint64_t slba = 0;
4233 uint32_t zone_idx = 0;
4234 uint16_t status;
4235 uint8_t action = cmd->zsa;
4236 bool all;
4237 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4238
4239 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4240
4241 req->status = NVME_SUCCESS;
4242
4243 if (!all) {
4244 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4245 if (status) {
4246 return status;
4247 }
4248 }
4249
4250 zone = &ns->zone_array[zone_idx];
4251 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4252 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4253 return NVME_INVALID_FIELD | NVME_DNR;
4254 }
4255
4256 switch (action) {
4257
4258 case NVME_ZONE_ACTION_OPEN:
4259 if (all) {
4260 proc_mask = NVME_PROC_CLOSED_ZONES;
4261 }
4262 trace_pci_nvme_open_zone(slba, zone_idx, all);
4263 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4264 break;
4265
4266 case NVME_ZONE_ACTION_CLOSE:
4267 if (all) {
4268 proc_mask = NVME_PROC_OPENED_ZONES;
4269 }
4270 trace_pci_nvme_close_zone(slba, zone_idx, all);
4271 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4272 break;
4273
4274 case NVME_ZONE_ACTION_FINISH:
4275 if (all) {
4276 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4277 }
4278 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4279 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4280 break;
4281
4282 case NVME_ZONE_ACTION_RESET:
4283 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4284
4285 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4286 nvme_misc_cb, req);
4287
4288 iocb->req = req;
4289 iocb->ret = 0;
4290 iocb->all = all;
4291 iocb->idx = zone_idx;
4292 iocb->zone = NULL;
4293
4294 req->aiocb = &iocb->common;
4295 nvme_zone_reset_cb(iocb, 0);
4296
4297 return NVME_NO_COMPLETE;
4298
4299 case NVME_ZONE_ACTION_OFFLINE:
4300 if (all) {
4301 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4302 }
4303 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4304 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4305 break;
4306
4307 case NVME_ZONE_ACTION_SET_ZD_EXT:
4308 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4309 if (all || !ns->params.zd_extension_size) {
4310 return NVME_INVALID_FIELD | NVME_DNR;
4311 }
4312 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4313 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4314 if (status) {
4315 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4316 return status;
4317 }
4318
4319 status = nvme_set_zd_ext(ns, zone);
4320 if (status == NVME_SUCCESS) {
4321 trace_pci_nvme_zd_extension_set(zone_idx);
4322 return status;
4323 }
4324 break;
4325
4326 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4327 if (all) {
4328 return NVME_INVALID_FIELD | NVME_DNR;
4329 }
4330
4331 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4332
4333 default:
4334 trace_pci_nvme_err_invalid_mgmt_action(action);
4335 status = NVME_INVALID_FIELD;
4336 }
4337
4338 if (status == NVME_ZONE_INVAL_TRANSITION) {
4339 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4340 zone->d.za);
4341 }
4342 if (status) {
4343 status |= NVME_DNR;
4344 }
4345
4346 return status;
4347 }
4348
4349 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4350 {
4351 NvmeZoneState zs = nvme_get_zone_state(zl);
4352
4353 switch (zafs) {
4354 case NVME_ZONE_REPORT_ALL:
4355 return true;
4356 case NVME_ZONE_REPORT_EMPTY:
4357 return zs == NVME_ZONE_STATE_EMPTY;
4358 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4359 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4360 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4361 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4362 case NVME_ZONE_REPORT_CLOSED:
4363 return zs == NVME_ZONE_STATE_CLOSED;
4364 case NVME_ZONE_REPORT_FULL:
4365 return zs == NVME_ZONE_STATE_FULL;
4366 case NVME_ZONE_REPORT_READ_ONLY:
4367 return zs == NVME_ZONE_STATE_READ_ONLY;
4368 case NVME_ZONE_REPORT_OFFLINE:
4369 return zs == NVME_ZONE_STATE_OFFLINE;
4370 default:
4371 return false;
4372 }
4373 }
4374
4375 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4376 {
4377 NvmeCmd *cmd = &req->cmd;
4378 NvmeNamespace *ns = req->ns;
4379 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4380 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4381 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4382 uint32_t zone_idx, zra, zrasf, partial;
4383 uint64_t max_zones, nr_zones = 0;
4384 uint16_t status;
4385 uint64_t slba;
4386 NvmeZoneDescr *z;
4387 NvmeZone *zone;
4388 NvmeZoneReportHeader *header;
4389 void *buf, *buf_p;
4390 size_t zone_entry_sz;
4391 int i;
4392
4393 req->status = NVME_SUCCESS;
4394
4395 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4396 if (status) {
4397 return status;
4398 }
4399
4400 zra = dw13 & 0xff;
4401 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4402 return NVME_INVALID_FIELD | NVME_DNR;
4403 }
4404 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4405 return NVME_INVALID_FIELD | NVME_DNR;
4406 }
4407
4408 zrasf = (dw13 >> 8) & 0xff;
4409 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4410 return NVME_INVALID_FIELD | NVME_DNR;
4411 }
4412
4413 if (data_size < sizeof(NvmeZoneReportHeader)) {
4414 return NVME_INVALID_FIELD | NVME_DNR;
4415 }
4416
4417 status = nvme_check_mdts(n, data_size);
4418 if (status) {
4419 return status;
4420 }
4421
4422 partial = (dw13 >> 16) & 0x01;
4423
4424 zone_entry_sz = sizeof(NvmeZoneDescr);
4425 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4426 zone_entry_sz += ns->params.zd_extension_size;
4427 }
4428
4429 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4430 buf = g_malloc0(data_size);
4431
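/*
* First pass: count the zones matching the filter for the report header.
* With Partial Report set, only as many zones as fit in the buffer are
* counted.
*/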
4432 zone = &ns->zone_array[zone_idx];
4433 for (i = zone_idx; i < ns->num_zones; i++) {
4434 if (partial && nr_zones >= max_zones) {
4435 break;
4436 }
4437 if (nvme_zone_matches_filter(zrasf, zone++)) {
4438 nr_zones++;
4439 }
4440 }
4441 header = buf;
4442 header->nr_zones = cpu_to_le64(nr_zones);
4443
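/* Second pass: emit matching zone descriptors (and extensions) until the
* buffer is full. */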
4444 buf_p = buf + sizeof(NvmeZoneReportHeader);
4445 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4446 zone = &ns->zone_array[zone_idx];
4447 if (nvme_zone_matches_filter(zrasf, zone)) {
4448 z = buf_p;
4449 buf_p += sizeof(NvmeZoneDescr);
4450
4451 z->zt = zone->d.zt;
4452 z->zs = zone->d.zs;
4453 z->zcap = cpu_to_le64(zone->d.zcap);
4454 z->zslba = cpu_to_le64(zone->d.zslba);
4455 z->za = zone->d.za;
4456
4457 if (nvme_wp_is_valid(zone)) {
4458 z->wp = cpu_to_le64(zone->d.wp);
4459 } else {
4460 z->wp = cpu_to_le64(~0ULL);
4461 }
4462
4463 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4464 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4465 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4466 ns->params.zd_extension_size);
4467 }
4468 buf_p += ns->params.zd_extension_size;
4469 }
4470
4471 max_zones--;
4472 }
4473 }
4474
4475 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4476
4477 g_free(buf);
4478
4479 return status;
4480 }
4481
4482 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4483 size_t len)
4484 {
4485 NvmeNamespace *ns = req->ns;
4486 NvmeEnduranceGroup *endgrp;
4487 NvmeRuhStatus *hdr;
4488 NvmeRuhStatusDescr *ruhsd;
4489 unsigned int nruhsd;
4490 uint16_t rg, ph, *ruhid;
4491 size_t trans_len;
4492 g_autofree uint8_t *buf = NULL;
4493
4494 if (!n->subsys) {
4495 return NVME_INVALID_FIELD | NVME_DNR;
4496 }
4497
4498 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4499 return NVME_INVALID_NSID | NVME_DNR;
4500 }
4501
4502 if (!n->subsys->endgrp.fdp.enabled) {
4503 return NVME_FDP_DISABLED | NVME_DNR;
4504 }
4505
4506 endgrp = ns->endgrp;
4507
4508 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4509 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4510 buf = g_malloc0(trans_len);
4511
4512 trans_len = MIN(trans_len, len);
4513
4514 hdr = (NvmeRuhStatus *)buf;
4515 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4516
4517 hdr->nruhsd = cpu_to_le16(nruhsd);
4518
4519 ruhid = ns->fdp.phs;
4520
4521 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4522 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4523
4524 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4525 uint16_t pid = nvme_make_pid(ns, rg, ph);
4526
4527 ruhsd->pid = cpu_to_le16(pid);
4528 ruhsd->ruhid = *ruhid;
4529 ruhsd->earutr = 0;
4530 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4531 }
4532 }
4533
4534 return nvme_c2h(n, buf, trans_len, req);
4535 }
4536
4537 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4538 {
4539 NvmeCmd *cmd = &req->cmd;
4540 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4541 uint32_t numd = le32_to_cpu(cmd->cdw11);
4542 uint8_t mo = (cdw10 & 0xff);
4543 size_t len = (numd + 1) << 2;
4544
4545 switch (mo) {
4546 case NVME_IOMR_MO_NOP:
4547 return 0;
4548 case NVME_IOMR_MO_RUH_STATUS:
4549 return nvme_io_mgmt_recv_ruhs(n, req, len);
4550 default:
4551 return NVME_INVALID_FIELD | NVME_DNR;
4552 };
4553 }
4554
4555 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4556 {
4557 NvmeCmd *cmd = &req->cmd;
4558 NvmeNamespace *ns = req->ns;
4559 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4560 uint16_t ret = NVME_SUCCESS;
4561 uint32_t npid = (cdw10 >> 16) + 1;
4562 unsigned int i = 0;
4563 g_autofree uint16_t *pids = NULL;
4564 uint32_t maxnpid;
4565
4566 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4567 return NVME_FDP_DISABLED | NVME_DNR;
4568 }
4569
4570 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4571
4572 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4573 return NVME_INVALID_FIELD | NVME_DNR;
4574 }
4575
4576 pids = g_new(uint16_t, npid);
4577
4578 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4579 if (ret) {
4580 return ret;
4581 }
4582
4583 for (; i < npid; i++) {
4584 if (!nvme_update_ruh(n, ns, pids[i])) {
4585 return NVME_INVALID_FIELD | NVME_DNR;
4586 }
4587 }
4588
4589 return ret;
4590 }
4591
4592 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4593 {
4594 NvmeCmd *cmd = &req->cmd;
4595 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4596 uint8_t mo = (cdw10 & 0xff);
4597
4598 switch (mo) {
4599 case NVME_IOMS_MO_NOP:
4600 return 0;
4601 case NVME_IOMS_MO_RUH_UPDATE:
4602 return nvme_io_mgmt_send_ruh_update(n, req);
4603 default:
4604 return NVME_INVALID_FIELD | NVME_DNR;
4605 };
4606 }
4607
4608 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4609 {
4610 NvmeNamespace *ns;
4611 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4612
4613 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4614 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4615
4616 /*
4617 * In the base NVM command set, Flush may apply to all namespaces
4618 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4619 * along with TP 4056 (Namespace Types), the semantics become murky.
4620 *
4621 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4622 * opcode with a specific command since we cannot determine a unique I/O
4623 * command set. Opcode 0h could have a completely different meaning in
4624 * some other command set than something equivalent to flushing - does
4625 * an NSID of FFFFFFFFh then
4626 * mean "for all namespaces, apply whatever command set specific command
4627 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4628 * whatever command that uses the 0h opcode if, and only if, it allows NSID
4629 * to be FFFFFFFFh"?
4630 *
4631 * Anyway (and luckily), for now, we do not care about this since the
4632 * device only supports namespace types that include the NVM Flush command
4633 * (NVM and Zoned), so always do an NVM Flush.
4634 */
4635
4636 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4637 return nvme_flush(n, req);
4638 }
4639
4640 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4641 return NVME_INVALID_NSID | NVME_DNR;
4642 }
4643
4644 ns = nvme_ns(n, nsid);
4645 if (unlikely(!ns)) {
4646 return NVME_INVALID_FIELD | NVME_DNR;
4647 }
4648
4649 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4650 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4651 return NVME_INVALID_OPCODE | NVME_DNR;
4652 }
4653
4654 if (ns->status) {
4655 return ns->status;
4656 }
4657
4658 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4659 return NVME_INVALID_FIELD;
4660 }
4661
4662 req->ns = ns;
4663
4664 switch (req->cmd.opcode) {
4665 case NVME_CMD_WRITE_ZEROES:
4666 return nvme_write_zeroes(n, req);
4667 case NVME_CMD_ZONE_APPEND:
4668 return nvme_zone_append(n, req);
4669 case NVME_CMD_WRITE:
4670 return nvme_write(n, req);
4671 case NVME_CMD_READ:
4672 return nvme_read(n, req);
4673 case NVME_CMD_COMPARE:
4674 return nvme_compare(n, req);
4675 case NVME_CMD_DSM:
4676 return nvme_dsm(n, req);
4677 case NVME_CMD_VERIFY:
4678 return nvme_verify(n, req);
4679 case NVME_CMD_COPY:
4680 return nvme_copy(n, req);
4681 case NVME_CMD_ZONE_MGMT_SEND:
4682 return nvme_zone_mgmt_send(n, req);
4683 case NVME_CMD_ZONE_MGMT_RECV:
4684 return nvme_zone_mgmt_recv(n, req);
4685 case NVME_CMD_IO_MGMT_RECV:
4686 return nvme_io_mgmt_recv(n, req);
4687 case NVME_CMD_IO_MGMT_SEND:
4688 return nvme_io_mgmt_send(n, req);
4689 default:
4690 g_assert_not_reached();
4691 }
4692
4693 return NVME_INVALID_OPCODE | NVME_DNR;
4694 }
4695
4696 static void nvme_cq_notifier(EventNotifier *e)
4697 {
4698 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4699 NvmeCtrl *n = cq->ctrl;
4700
4701 if (!event_notifier_test_and_clear(e)) {
4702 return;
4703 }
4704
4705 nvme_update_cq_head(cq);
4706
4707 if (cq->tail == cq->head) {
4708 if (cq->irq_enabled) {
4709 n->cq_pending--;
4710 }
4711
4712 nvme_irq_deassert(n, cq);
4713 }
4714
4715 qemu_bh_schedule(cq->bh);
4716 }
4717
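/*
* The CQ head doorbell for queue cqid lives at offset 0x1000 + cqid * 8 + 4
* (4-byte doorbell stride); attach an ioeventfd there so guest doorbell
* writes are delivered through the event notifier.
*/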
4718 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4719 {
4720 NvmeCtrl *n = cq->ctrl;
4721 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4722 int ret;
4723
4724 ret = event_notifier_init(&cq->notifier, 0);
4725 if (ret < 0) {
4726 return ret;
4727 }
4728
4729 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4730 memory_region_add_eventfd(&n->iomem,
4731 0x1000 + offset, 4, false, 0, &cq->notifier);
4732
4733 return 0;
4734 }
4735
4736 static void nvme_sq_notifier(EventNotifier *e)
4737 {
4738 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4739
4740 if (!event_notifier_test_and_clear(e)) {
4741 return;
4742 }
4743
4744 nvme_process_sq(sq);
4745 }
4746
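/*
* The SQ tail doorbell for queue sqid lives at offset 0x1000 + sqid * 8;
* hook an ioeventfd to it so guest doorbell writes are handled via
* nvme_sq_notifier().
*/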
4747 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4748 {
4749 NvmeCtrl *n = sq->ctrl;
4750 uint16_t offset = sq->sqid << 3;
4751 int ret;
4752
4753 ret = event_notifier_init(&sq->notifier, 0);
4754 if (ret < 0) {
4755 return ret;
4756 }
4757
4758 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4759 memory_region_add_eventfd(&n->iomem,
4760 0x1000 + offset, 4, false, 0, &sq->notifier);
4761
4762 return 0;
4763 }
4764
4765 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4766 {
4767 uint16_t offset = sq->sqid << 3;
4768
4769 n->sq[sq->sqid] = NULL;
4770 qemu_bh_delete(sq->bh);
4771 if (sq->ioeventfd_enabled) {
4772 memory_region_del_eventfd(&n->iomem,
4773 0x1000 + offset, 4, false, 0, &sq->notifier);
4774 event_notifier_set_handler(&sq->notifier, NULL);
4775 event_notifier_cleanup(&sq->notifier);
4776 }
4777 g_free(sq->io_req);
4778 if (sq->sqid) {
4779 g_free(sq);
4780 }
4781 }
4782
4783 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4784 {
4785 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4786 NvmeRequest *r, *next;
4787 NvmeSQueue *sq;
4788 NvmeCQueue *cq;
4789 uint16_t qid = le16_to_cpu(c->qid);
4790
4791 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4792 trace_pci_nvme_err_invalid_del_sq(qid);
4793 return NVME_INVALID_QID | NVME_DNR;
4794 }
4795
4796 trace_pci_nvme_del_sq(qid);
4797
4798 sq = n->sq[qid];
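/* Cancel and drain any outstanding requests on this queue before freeing it. */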
4799 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4800 r = QTAILQ_FIRST(&sq->out_req_list);
4801 assert(r->aiocb);
4802 blk_aio_cancel(r->aiocb);
4803 }
4804
4805 assert(QTAILQ_EMPTY(&sq->out_req_list));
4806
4807 if (!nvme_check_cqid(n, sq->cqid)) {
4808 cq = n->cq[sq->cqid];
4809 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4810
4811 nvme_post_cqes(cq);
4812 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4813 if (r->sq == sq) {
4814 QTAILQ_REMOVE(&cq->req_list, r, entry);
4815 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4816 }
4817 }
4818 }
4819
4820 nvme_free_sq(sq, n);
4821 return NVME_SUCCESS;
4822 }
4823
4824 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4825 uint16_t sqid, uint16_t cqid, uint16_t size)
4826 {
4827 int i;
4828 NvmeCQueue *cq;
4829
4830 sq->ctrl = n;
4831 sq->dma_addr = dma_addr;
4832 sq->sqid = sqid;
4833 sq->size = size;
4834 sq->cqid = cqid;
4835 sq->head = sq->tail = 0;
4836 sq->io_req = g_new0(NvmeRequest, sq->size);
4837
4838 QTAILQ_INIT(&sq->req_list);
4839 QTAILQ_INIT(&sq->out_req_list);
4840 for (i = 0; i < sq->size; i++) {
4841 sq->io_req[i].sq = sq;
4842 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4843 }
4844
4845 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4846 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4847
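/*
* With Doorbell Buffer Config enabled, the SQ tail is read from the guest
* shadow doorbell buffer and an EventIdx entry is maintained; a per-queue
* ioeventfd is additionally set up for I/O queues when enabled.
*/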
4848 if (n->dbbuf_enabled) {
4849 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4850 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4851
4852 if (n->params.ioeventfd && sq->sqid != 0) {
4853 if (!nvme_init_sq_ioeventfd(sq)) {
4854 sq->ioeventfd_enabled = true;
4855 }
4856 }
4857 }
4858
4859 assert(n->cq[cqid]);
4860 cq = n->cq[cqid];
4861 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4862 n->sq[sqid] = sq;
4863 }
4864
4865 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4866 {
4867 NvmeSQueue *sq;
4868 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4869
4870 uint16_t cqid = le16_to_cpu(c->cqid);
4871 uint16_t sqid = le16_to_cpu(c->sqid);
4872 uint16_t qsize = le16_to_cpu(c->qsize);
4873 uint16_t qflags = le16_to_cpu(c->sq_flags);
4874 uint64_t prp1 = le64_to_cpu(c->prp1);
4875
4876 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4877
4878 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4879 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4880 return NVME_INVALID_CQID | NVME_DNR;
4881 }
4882 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4883 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4884 return NVME_INVALID_QID | NVME_DNR;
4885 }
4886 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4887 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4888 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4889 }
4890 if (unlikely(prp1 & (n->page_size - 1))) {
4891 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4892 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4893 }
4894 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4895 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4896 return NVME_INVALID_FIELD | NVME_DNR;
4897 }
4898 sq = g_malloc0(sizeof(*sq));
4899 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4900 return NVME_SUCCESS;
4901 }
4902
4903 struct nvme_stats {
4904 uint64_t units_read;
4905 uint64_t units_written;
4906 uint64_t read_commands;
4907 uint64_t write_commands;
4908 };
4909
4910 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4911 {
4912 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4913
4914 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4915 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4916 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4917 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4918 }
4919
4920 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4921 uint64_t off, NvmeRequest *req)
4922 {
4923 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4924 struct nvme_stats stats = { 0 };
4925 NvmeSmartLog smart = { 0 };
4926 uint32_t trans_len;
4927 NvmeNamespace *ns;
4928 time_t current_ms;
4929 uint64_t u_read, u_written;
4930
4931 if (off >= sizeof(smart)) {
4932 return NVME_INVALID_FIELD | NVME_DNR;
4933 }
4934
4935 if (nsid != 0xffffffff) {
4936 ns = nvme_ns(n, nsid);
4937 if (!ns) {
4938 return NVME_INVALID_NSID | NVME_DNR;
4939 }
4940 nvme_set_blk_stats(ns, &stats);
4941 } else {
4942 int i;
4943
4944 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4945 ns = nvme_ns(n, i);
4946 if (!ns) {
4947 continue;
4948 }
4949 nvme_set_blk_stats(ns, &stats);
4950 }
4951 }
4952
4953 trans_len = MIN(sizeof(smart) - off, buf_len);
4954 smart.critical_warning = n->smart_critical_warning;
4955
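/* Data Units Read/Written are reported in units of 1000 512-byte blocks,
* rounded up. */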
4956 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
4957 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
4958
4959 smart.data_units_read[0] = cpu_to_le64(u_read);
4960 smart.data_units_written[0] = cpu_to_le64(u_written);
4961 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4962 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4963
4964 smart.temperature = cpu_to_le16(n->temperature);
4965
4966 if ((n->temperature >= n->features.temp_thresh_hi) ||
4967 (n->temperature <= n->features.temp_thresh_low)) {
4968 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4969 }
4970
4971 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4972 smart.power_on_hours[0] =
4973 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4974
4975 if (!rae) {
4976 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4977 }
4978
4979 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4980 }
4981
4982 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4983 uint64_t off, NvmeRequest *req)
4984 {
4985 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
4986 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
4987 struct nvme_stats stats = {};
4988 NvmeEndGrpLog info = {};
4989 int i;
4990
4991 if (!n->subsys || endgrpid != 0x1) {
4992 return NVME_INVALID_FIELD | NVME_DNR;
4993 }
4994
4995 if (off >= sizeof(info)) {
4996 return NVME_INVALID_FIELD | NVME_DNR;
4997 }
4998
4999 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5000 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
5001 if (!ns) {
5002 continue;
5003 }
5004
5005 nvme_set_blk_stats(ns, &stats);
5006 }
5007
5008 info.data_units_read[0] =
5009 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
5010 info.data_units_written[0] =
5011 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5012 info.media_units_written[0] =
5013 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5014
5015 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5016 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5017
5018 buf_len = MIN(sizeof(info) - off, buf_len);
5019
5020 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
5021 }
5022
5023
5024 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
5025 NvmeRequest *req)
5026 {
5027 uint32_t trans_len;
5028 NvmeFwSlotInfoLog fw_log = {
5029 .afi = 0x1,
5030 };
5031
5032 if (off >= sizeof(fw_log)) {
5033 return NVME_INVALID_FIELD | NVME_DNR;
5034 }
5035
5036 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5037 trans_len = MIN(sizeof(fw_log) - off, buf_len);
5038
5039 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5040 }
5041
5042 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5043 uint64_t off, NvmeRequest *req)
5044 {
5045 uint32_t trans_len;
5046 NvmeErrorLog errlog;
5047
5048 if (off >= sizeof(errlog)) {
5049 return NVME_INVALID_FIELD | NVME_DNR;
5050 }
5051
5052 if (!rae) {
5053 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5054 }
5055
5056 memset(&errlog, 0x0, sizeof(errlog));
5057 trans_len = MIN(sizeof(errlog) - off, buf_len);
5058
5059 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5060 }
5061
5062 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5063 uint64_t off, NvmeRequest *req)
5064 {
5065 uint32_t nslist[1024];
5066 uint32_t trans_len;
5067 int i = 0;
5068 uint32_t nsid;
5069
5070 if (off >= sizeof(nslist)) {
5071 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5072 return NVME_INVALID_FIELD | NVME_DNR;
5073 }
5074
5075 memset(nslist, 0x0, sizeof(nslist));
5076 trans_len = MIN(sizeof(nslist) - off, buf_len);
5077
5078 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5079 NVME_CHANGED_NSID_SIZE) {
5080 /*
5081 * If more than 1024 namespaces have changed, the first entry in the log
5082 * page is set to FFFFFFFFh and the others to 0, as the spec requires.
5083 */
5084 if (i == ARRAY_SIZE(nslist)) {
5085 memset(nslist, 0x0, sizeof(nslist));
5086 nslist[0] = 0xffffffff;
5087 break;
5088 }
5089
5090 nslist[i++] = nsid;
5091 clear_bit(nsid, n->changed_nsids);
5092 }
5093
5094 /*
5095 * Clear all remaining changed-namespace bits when more than 1024
5096 * namespaces have changed and only FFFFFFFFh is reported.
5097 */
5098 if (nslist[0] == 0xffffffff) {
5099 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5100 }
5101
5102 if (!rae) {
5103 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5104 }
5105
5106 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5107 }
5108
5109 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5110 uint64_t off, NvmeRequest *req)
5111 {
5112 NvmeEffectsLog log = {};
5113 const uint32_t *src_iocs = NULL;
5114 uint32_t trans_len;
5115
5116 if (off >= sizeof(log)) {
5117 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5118 return NVME_INVALID_FIELD | NVME_DNR;
5119 }
5120
5121 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5122 case NVME_CC_CSS_NVM:
5123 src_iocs = nvme_cse_iocs_nvm;
5124 /* fall through */
5125 case NVME_CC_CSS_ADMIN_ONLY:
5126 break;
5127 case NVME_CC_CSS_CSI:
5128 switch (csi) {
5129 case NVME_CSI_NVM:
5130 src_iocs = nvme_cse_iocs_nvm;
5131 break;
5132 case NVME_CSI_ZONED:
5133 src_iocs = nvme_cse_iocs_zoned;
5134 break;
5135 }
5136 }
5137
5138 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
5139
5140 if (src_iocs) {
5141 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
5142 }
5143
5144 trans_len = MIN(sizeof(log) - off, buf_len);
5145
5146 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5147 }
5148
5149 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5150 {
5151 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5152 + vss;
5153 return ROUND_UP(entry_siz, 8);
5154 }
5155
5156 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5157 uint64_t off, NvmeRequest *req)
5158 {
5159 uint32_t log_size, trans_len;
5160 g_autofree uint8_t *buf = NULL;
5161 NvmeFdpDescrHdr *hdr;
5162 NvmeRuhDescr *ruhd;
5163 NvmeEnduranceGroup *endgrp;
5164 NvmeFdpConfsHdr *log;
5165 size_t nruh, fdp_descr_size;
5166 int i;
5167
5168 if (endgrpid != 1 || !n->subsys) {
5169 return NVME_INVALID_FIELD | NVME_DNR;
5170 }
5171
5172 endgrp = &n->subsys->endgrp;
5173
5174 if (endgrp->fdp.enabled) {
5175 nruh = endgrp->fdp.nruh;
5176 } else {
5177 nruh = 1;
5178 }
5179
5180 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5181 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5182
5183 if (off >= log_size) {
5184 return NVME_INVALID_FIELD | NVME_DNR;
5185 }
5186
5187 trans_len = MIN(log_size - off, buf_len);
5188
5189 buf = g_malloc0(log_size);
5190 log = (NvmeFdpConfsHdr *)buf;
5191 hdr = (NvmeFdpDescrHdr *)(log + 1);
5192 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5193
5194 log->num_confs = cpu_to_le16(0);
5195 log->size = cpu_to_le32(log_size);
5196
5197 hdr->descr_size = cpu_to_le16(fdp_descr_size);
5198 if (endgrp->fdp.enabled) {
5199 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5200 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5201 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5202 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5203 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5204 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5205 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5206
5207 for (i = 0; i < nruh; i++) {
5208 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5209 ruhd++;
5210 }
5211 } else {
5212 /* 1 bit for RUH in PIF -> 2 RUHs max. */
5213 hdr->nrg = cpu_to_le16(1);
5214 hdr->nruh = cpu_to_le16(1);
5215 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5216 hdr->nnss = cpu_to_le32(1);
5217 hdr->runs = cpu_to_le64(96 * MiB);
5218
5219 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5220 }
5221
5222 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5223 }
5224
5225 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5226 uint32_t dw10, uint32_t dw12,
5227 uint32_t buf_len, uint64_t off,
5228 NvmeRequest *req)
5229 {
5230 NvmeRuHandle *ruh;
5231 NvmeRuhuLog *hdr;
5232 NvmeRuhuDescr *ruhud;
5233 NvmeEnduranceGroup *endgrp;
5234 g_autofree uint8_t *buf = NULL;
5235 uint32_t log_size, trans_len;
5236 uint16_t i;
5237
5238 if (endgrpid != 1 || !n->subsys) {
5239 return NVME_INVALID_FIELD | NVME_DNR;
5240 }
5241
5242 endgrp = &n->subsys->endgrp;
5243
5244 if (!endgrp->fdp.enabled) {
5245 return NVME_FDP_DISABLED | NVME_DNR;
5246 }
5247
5248 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5249
5250 if (off >= log_size) {
5251 return NVME_INVALID_FIELD | NVME_DNR;
5252 }
5253
5254 trans_len = MIN(log_size - off, buf_len);
5255
5256 buf = g_malloc0(log_size);
5257 hdr = (NvmeRuhuLog *)buf;
5258 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5259
5260 ruh = endgrp->fdp.ruhs;
5261 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5262
5263 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5264 ruhud->ruha = ruh->ruha;
5265 }
5266
5267 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5268 }
5269
5270 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5271 uint64_t off, NvmeRequest *req)
5272 {
5273 NvmeEnduranceGroup *endgrp;
5274 NvmeFdpStatsLog log = {};
5275 uint32_t trans_len;
5276
5277 if (off >= sizeof(NvmeFdpStatsLog)) {
5278 return NVME_INVALID_FIELD | NVME_DNR;
5279 }
5280
5281 if (endgrpid != 1 || !n->subsys) {
5282 return NVME_INVALID_FIELD | NVME_DNR;
5283 }
5284
5285 if (!n->subsys->endgrp.fdp.enabled) {
5286 return NVME_FDP_DISABLED | NVME_DNR;
5287 }
5288
5289 endgrp = &n->subsys->endgrp;
5290
5291 trans_len = MIN(sizeof(log) - off, buf_len);
5292
5293 /* spec value is 128 bit, we only use 64 bit */
5294 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5295 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5296 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5297
5298 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5299 }
5300
5301 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5302 uint32_t buf_len, uint64_t off,
5303 NvmeRequest *req)
5304 {
5305 NvmeEnduranceGroup *endgrp;
5306 NvmeCmd *cmd = &req->cmd;
5307 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5308 uint32_t log_size, trans_len;
5309 NvmeFdpEventBuffer *ebuf;
5310 g_autofree NvmeFdpEventsLog *elog = NULL;
5311 NvmeFdpEvent *event;
5312
5313 if (endgrpid != 1 || !n->subsys) {
5314 return NVME_INVALID_FIELD | NVME_DNR;
5315 }
5316
5317 endgrp = &n->subsys->endgrp;
5318
5319 if (!endgrp->fdp.enabled) {
5320 return NVME_FDP_DISABLED | NVME_DNR;
5321 }
5322
5323 if (host_events) {
5324 ebuf = &endgrp->fdp.host_events;
5325 } else {
5326 ebuf = &endgrp->fdp.ctrl_events;
5327 }
5328
5329 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5330
5331 if (off >= log_size) {
5332 return NVME_INVALID_FIELD | NVME_DNR;
5333 }
5334
5335 trans_len = MIN(log_size - off, buf_len);
5336 elog = g_malloc0(log_size);
5337 elog->num_events = cpu_to_le32(ebuf->nelems);
5338 event = (NvmeFdpEvent *)(elog + 1);
5339
5340 if (ebuf->nelems && ebuf->start == ebuf->next) {
5341 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5342 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
5343 memcpy(event, &ebuf->events[ebuf->start],
5344 sizeof(NvmeFdpEvent) * nelems);
5345 memcpy(event + nelems, ebuf->events,
5346 sizeof(NvmeFdpEvent) * ebuf->next);
5347 } else if (ebuf->start < ebuf->next) {
5348 memcpy(event, &ebuf->events[ebuf->start],
5349 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5350 }
5351
5352 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5353 }
5354
5355 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5356 {
5357 NvmeCmd *cmd = &req->cmd;
5358
5359 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5360 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5361 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5362 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5363 uint8_t lid = dw10 & 0xff;
5364 uint8_t lsp = (dw10 >> 8) & 0xf;
5365 uint8_t rae = (dw10 >> 15) & 0x1;
5366 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5367 uint32_t numdl, numdu, lspi;
5368 uint64_t off, lpol, lpou;
5369 size_t len;
5370 uint16_t status;
5371
5372 numdl = (dw10 >> 16);
5373 numdu = (dw11 & 0xffff);
5374 lspi = (dw11 >> 16);
5375 lpol = dw12;
5376 lpou = dw13;
5377
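/*
* NUMD is a zero-based dword count split across CDW10 (NUMDL) and CDW11
* (NUMDU); convert it to a byte length. LPOL/LPOU form the 64-bit log page
* offset.
*/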
5378 len = (((numdu << 16) | numdl) + 1) << 2;
5379 off = (lpou << 32ULL) | lpol;
5380
5381 if (off & 0x3) {
5382 return NVME_INVALID_FIELD | NVME_DNR;
5383 }
5384
5385 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5386
5387 status = nvme_check_mdts(n, len);
5388 if (status) {
5389 return status;
5390 }
5391
5392 switch (lid) {
5393 case NVME_LOG_ERROR_INFO:
5394 return nvme_error_info(n, rae, len, off, req);
5395 case NVME_LOG_SMART_INFO:
5396 return nvme_smart_info(n, rae, len, off, req);
5397 case NVME_LOG_FW_SLOT_INFO:
5398 return nvme_fw_log_info(n, len, off, req);
5399 case NVME_LOG_CHANGED_NSLIST:
5400 return nvme_changed_nslist(n, rae, len, off, req);
5401 case NVME_LOG_CMD_EFFECTS:
5402 return nvme_cmd_effects(n, csi, len, off, req);
5403 case NVME_LOG_ENDGRP:
5404 return nvme_endgrp_info(n, rae, len, off, req);
5405 case NVME_LOG_FDP_CONFS:
5406 return nvme_fdp_confs(n, lspi, len, off, req);
5407 case NVME_LOG_FDP_RUH_USAGE:
5408 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5409 case NVME_LOG_FDP_STATS:
5410 return nvme_fdp_stats(n, lspi, len, off, req);
5411 case NVME_LOG_FDP_EVENTS:
5412 return nvme_fdp_events(n, lspi, len, off, req);
5413 default:
5414 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5415 return NVME_INVALID_FIELD | NVME_DNR;
5416 }
5417 }
5418
5419 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5420 {
5421 PCIDevice *pci = PCI_DEVICE(n);
5422 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5423
5424 n->cq[cq->cqid] = NULL;
5425 qemu_bh_delete(cq->bh);
5426 if (cq->ioeventfd_enabled) {
5427 memory_region_del_eventfd(&n->iomem,
5428 0x1000 + offset, 4, false, 0, &cq->notifier);
5429 event_notifier_set_handler(&cq->notifier, NULL);
5430 event_notifier_cleanup(&cq->notifier);
5431 }
5432 if (msix_enabled(pci) && cq->irq_enabled) {
5433 msix_vector_unuse(pci, cq->vector);
5434 }
5435 if (cq->cqid) {
5436 g_free(cq);
5437 }
5438 }
5439
5440 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5441 {
5442 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5443 NvmeCQueue *cq;
5444 uint16_t qid = le16_to_cpu(c->qid);
5445
5446 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5447 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5448 return NVME_INVALID_CQID | NVME_DNR;
5449 }
5450
5451 cq = n->cq[qid];
5452 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5453 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5454 return NVME_INVALID_QUEUE_DEL;
5455 }
5456
5457 if (cq->irq_enabled && cq->tail != cq->head) {
5458 n->cq_pending--;
5459 }
5460
5461 nvme_irq_deassert(n, cq);
5462 trace_pci_nvme_del_cq(qid);
5463 nvme_free_cq(cq, n);
5464 return NVME_SUCCESS;
5465 }
5466
5467 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5468 uint16_t cqid, uint16_t vector, uint16_t size,
5469 uint16_t irq_enabled)
5470 {
5471 PCIDevice *pci = PCI_DEVICE(n);
5472
5473 if (msix_enabled(pci) && irq_enabled) {
5474 msix_vector_use(pci, vector);
5475 }
5476
5477 cq->ctrl = n;
5478 cq->cqid = cqid;
5479 cq->size = size;
5480 cq->dma_addr = dma_addr;
5481 cq->phase = 1;
5482 cq->irq_enabled = irq_enabled;
5483 cq->vector = vector;
5484 cq->head = cq->tail = 0;
5485 QTAILQ_INIT(&cq->req_list);
5486 QTAILQ_INIT(&cq->sq_list);
5487 if (n->dbbuf_enabled) {
5488 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5489 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5490
5491 if (n->params.ioeventfd && cqid != 0) {
5492 if (!nvme_init_cq_ioeventfd(cq)) {
5493 cq->ioeventfd_enabled = true;
5494 }
5495 }
5496 }
5497 n->cq[cqid] = cq;
5498 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5499 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5500 }
5501
5502 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5503 {
5504 NvmeCQueue *cq;
5505 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5506 uint16_t cqid = le16_to_cpu(c->cqid);
5507 uint16_t vector = le16_to_cpu(c->irq_vector);
5508 uint16_t qsize = le16_to_cpu(c->qsize);
5509 uint16_t qflags = le16_to_cpu(c->cq_flags);
5510 uint64_t prp1 = le64_to_cpu(c->prp1);
5511 uint32_t cc = ldq_le_p(&n->bar.cc);
5512 uint8_t iocqes = NVME_CC_IOCQES(cc);
5513 uint8_t iosqes = NVME_CC_IOSQES(cc);
5514
5515 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5516 NVME_CQ_FLAGS_IEN(qflags) != 0);
5517
5518 if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5519 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5520 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5521 }
5522
5523 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5524 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5525 return NVME_INVALID_QID | NVME_DNR;
5526 }
5527 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5528 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5529 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5530 }
5531 if (unlikely(prp1 & (n->page_size - 1))) {
5532 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5533 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5534 }
5535 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5536 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5537 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5538 }
5539 if (unlikely(vector >= n->conf_msix_qsize)) {
5540 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5541 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5542 }
5543 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5544 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5545 return NVME_INVALID_FIELD | NVME_DNR;
5546 }
5547
5548 cq = g_malloc0(sizeof(*cq));
5549 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5550 NVME_CQ_FLAGS_IEN(qflags));
5551
5552 /*
5553 * It is only required to set qs_created when creating a completion queue;
5554 * creating a submission queue without a matching completion queue will
5555 * fail.
5556 */
5557 n->qs_created = true;
5558 return NVME_SUCCESS;
5559 }
5560
5561 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5562 {
5563 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5564
5565 return nvme_c2h(n, id, sizeof(id), req);
5566 }
5567
5568 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5569 {
5570 trace_pci_nvme_identify_ctrl();
5571
5572 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5573 }
5574
5575 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5576 {
5577 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5578 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5579 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5580
5581 trace_pci_nvme_identify_ctrl_csi(c->csi);
5582
5583 switch (c->csi) {
5584 case NVME_CSI_NVM:
5585 id_nvm->vsl = n->params.vsl;
5586 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5587 break;
5588
5589 case NVME_CSI_ZONED:
5590 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5591 break;
5592
5593 default:
5594 return NVME_INVALID_FIELD | NVME_DNR;
5595 }
5596
5597 return nvme_c2h(n, id, sizeof(id), req);
5598 }
5599
5600 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5601 {
5602 NvmeNamespace *ns;
5603 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5604 uint32_t nsid = le32_to_cpu(c->nsid);
5605
5606 trace_pci_nvme_identify_ns(nsid);
5607
5608 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5609 return NVME_INVALID_NSID | NVME_DNR;
5610 }
5611
5612 ns = nvme_ns(n, nsid);
5613 if (unlikely(!ns)) {
5614 if (!active) {
5615 ns = nvme_subsys_ns(n->subsys, nsid);
5616 if (!ns) {
5617 return nvme_rpt_empty_id_struct(n, req);
5618 }
5619 } else {
5620 return nvme_rpt_empty_id_struct(n, req);
5621 }
5622 }
5623
5624 if (active || ns->csi == NVME_CSI_NVM) {
5625 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5626 }
5627
5628 return NVME_INVALID_CMD_SET | NVME_DNR;
5629 }
5630
5631 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5632 bool attached)
5633 {
5634 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5635 uint32_t nsid = le32_to_cpu(c->nsid);
5636 uint16_t min_id = le16_to_cpu(c->ctrlid);
5637 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5638 uint16_t *ids = &list[1];
5639 NvmeNamespace *ns;
5640 NvmeCtrl *ctrl;
5641 int cntlid, nr_ids = 0;
5642
5643 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5644
5645 if (!n->subsys) {
5646 return NVME_INVALID_FIELD | NVME_DNR;
5647 }
5648
5649 if (attached) {
5650 if (nsid == NVME_NSID_BROADCAST) {
5651 return NVME_INVALID_FIELD | NVME_DNR;
5652 }
5653
5654 ns = nvme_subsys_ns(n->subsys, nsid);
5655 if (!ns) {
5656 return NVME_INVALID_FIELD | NVME_DNR;
5657 }
5658 }
5659
5660 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5661 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5662 if (!ctrl) {
5663 continue;
5664 }
5665
5666 if (attached && !nvme_ns(ctrl, nsid)) {
5667 continue;
5668 }
5669
5670 ids[nr_ids++] = cntlid;
5671 }
5672
5673 list[0] = nr_ids;
5674
5675 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5676 }
5677
5678 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5679 {
5680 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5681
5682 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5683 sizeof(NvmePriCtrlCap), req);
5684 }
5685
5686 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5687 {
5688 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5689 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5690 uint16_t min_id = le16_to_cpu(c->ctrlid);
5691 uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5692 NvmeSecCtrlList list = {0};
5693 uint8_t i;
5694
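/*
* Return secondary controller entries starting at the first SCID greater
* than or equal to the requested CNTID; the list holds at most 127 entries.
*/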
5695 for (i = 0; i < num_sec_ctrl; i++) {
5696 if (n->sec_ctrl_list[i].scid >= min_id) {
5697 list.numcntl = MIN(num_sec_ctrl - i, 127);
5698 memcpy(&list.sec, n->sec_ctrl_list + i,
5699 list.numcntl * sizeof(NvmeSecCtrlEntry));
5700 break;
5701 }
5702 }
5703
5704 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5705
5706 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5707 }
5708
5709 static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc)
5710 {
5711 NvmeNamespace *ns;
5712 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5713 uint32_t nsid = le32_to_cpu(c->nsid);
5714
5715 trace_pci_nvme_identify_ns_ind(nsid);
5716
5717 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5718 return NVME_INVALID_NSID | NVME_DNR;
5719 }
5720
5721 ns = nvme_ns(n, nsid);
5722 if (unlikely(!ns)) {
5723 if (alloc) {
5724 ns = nvme_subsys_ns(n->subsys, nsid);
5725 if (!ns) {
5726 return nvme_rpt_empty_id_struct(n, req);
5727 }
5728 } else {
5729 return nvme_rpt_empty_id_struct(n, req);
5730 }
5731 }
5732
5733 return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req);
5734 }
5735
5736 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5737 bool active)
5738 {
5739 NvmeNamespace *ns;
5740 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5741 uint32_t nsid = le32_to_cpu(c->nsid);
5742
5743 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5744
5745 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5746 return NVME_INVALID_NSID | NVME_DNR;
5747 }
5748
5749 ns = nvme_ns(n, nsid);
5750 if (unlikely(!ns)) {
5751 if (!active) {
5752 ns = nvme_subsys_ns(n->subsys, nsid);
5753 if (!ns) {
5754 return nvme_rpt_empty_id_struct(n, req);
5755 }
5756 } else {
5757 return nvme_rpt_empty_id_struct(n, req);
5758 }
5759 }
5760
5761 if (c->csi == NVME_CSI_NVM) {
5762 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5763 req);
5764 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5765 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5766 req);
5767 }
5768
5769 return NVME_INVALID_FIELD | NVME_DNR;
5770 }
5771
5772 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5773 bool active)
5774 {
5775 NvmeNamespace *ns;
5776 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5777 uint32_t min_nsid = le32_to_cpu(c->nsid);
5778 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5779 static const int data_len = sizeof(list);
5780 uint32_t *list_ptr = (uint32_t *)list;
5781 int i, j = 0;
5782
5783 trace_pci_nvme_identify_nslist(min_nsid);
5784
5785 /*
5786 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5787 * since the Active Namespace ID List should return namespaces with ids
5788 * *higher* than the NSID specified in the command. This is also specified
5789 * in the spec (NVM Express v1.3d, Section 5.15.4).
5790 */
5791 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5792 return NVME_INVALID_NSID | NVME_DNR;
5793 }
5794
5795 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5796 ns = nvme_ns(n, i);
5797 if (!ns) {
5798 if (!active) {
5799 ns = nvme_subsys_ns(n->subsys, i);
5800 if (!ns) {
5801 continue;
5802 }
5803 } else {
5804 continue;
5805 }
5806 }
5807 if (ns->params.nsid <= min_nsid) {
5808 continue;
5809 }
5810 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5811 if (j == data_len / sizeof(uint32_t)) {
5812 break;
5813 }
5814 }
5815
5816 return nvme_c2h(n, list, data_len, req);
5817 }
5818
5819 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5820 bool active)
5821 {
5822 NvmeNamespace *ns;
5823 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5824 uint32_t min_nsid = le32_to_cpu(c->nsid);
5825 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5826 static const int data_len = sizeof(list);
5827 uint32_t *list_ptr = (uint32_t *)list;
5828 int i, j = 0;
5829
5830 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5831
5832 /*
5833 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5834 */
5835 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5836 return NVME_INVALID_NSID | NVME_DNR;
5837 }
5838
5839 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5840 return NVME_INVALID_FIELD | NVME_DNR;
5841 }
5842
5843 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5844 ns = nvme_ns(n, i);
5845 if (!ns) {
5846 if (!active) {
5847 ns = nvme_subsys_ns(n->subsys, i);
5848 if (!ns) {
5849 continue;
5850 }
5851 } else {
5852 continue;
5853 }
5854 }
5855 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5856 continue;
5857 }
5858 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5859 if (j == data_len / sizeof(uint32_t)) {
5860 break;
5861 }
5862 }
5863
5864 return nvme_c2h(n, list, data_len, req);
5865 }
5866
5867 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5868 {
5869 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5870 uint16_t *nr_ids = &list[0];
5871 uint16_t *ids = &list[1];
5872 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5873
5874 /*
5875 * The current nvme-subsys only supports Endurance Group #1.
5876 */
5877 if (!endgid) {
5878 *nr_ids = 1;
5879 ids[0] = 1;
5880 } else {
5881 *nr_ids = 0;
5882 }
5883
5884 return nvme_c2h(n, list, sizeof(list), req);
5885 }
5886
5887 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5888 {
5889 NvmeNamespace *ns;
5890 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5891 uint32_t nsid = le32_to_cpu(c->nsid);
5892 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5893 uint8_t *pos = list;
5894 struct {
5895 NvmeIdNsDescr hdr;
5896 uint8_t v[NVME_NIDL_UUID];
5897 } QEMU_PACKED uuid = {};
5898 struct {
5899 NvmeIdNsDescr hdr;
5900 uint8_t v[NVME_NIDL_NGUID];
5901 } QEMU_PACKED nguid = {};
5902 struct {
5903 NvmeIdNsDescr hdr;
5904 uint64_t v;
5905 } QEMU_PACKED eui64 = {};
5906 struct {
5907 NvmeIdNsDescr hdr;
5908 uint8_t v;
5909 } QEMU_PACKED csi = {};
5910
5911 trace_pci_nvme_identify_ns_descr_list(nsid);
5912
5913 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5914 return NVME_INVALID_NSID | NVME_DNR;
5915 }
5916
5917 ns = nvme_ns(n, nsid);
5918 if (unlikely(!ns)) {
5919 return NVME_INVALID_FIELD | NVME_DNR;
5920 }
5921
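/*
 * Emit one descriptor per configured identifier (UUID, NGUID, EUI-64),
 * followed by the CSI descriptor; the remainder of the buffer stays zeroed,
 * which terminates the list.
 */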
5922 if (!qemu_uuid_is_null(&ns->params.uuid)) {
5923 uuid.hdr.nidt = NVME_NIDT_UUID;
5924 uuid.hdr.nidl = NVME_NIDL_UUID;
5925 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5926 memcpy(pos, &uuid, sizeof(uuid));
5927 pos += sizeof(uuid);
5928 }
5929
5930 if (!nvme_nguid_is_null(&ns->params.nguid)) {
5931 nguid.hdr.nidt = NVME_NIDT_NGUID;
5932 nguid.hdr.nidl = NVME_NIDL_NGUID;
5933 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
5934 memcpy(pos, &nguid, sizeof(nguid));
5935 pos += sizeof(nguid);
5936 }
5937
5938 if (ns->params.eui64) {
5939 eui64.hdr.nidt = NVME_NIDT_EUI64;
5940 eui64.hdr.nidl = NVME_NIDL_EUI64;
5941 eui64.v = cpu_to_be64(ns->params.eui64);
5942 memcpy(pos, &eui64, sizeof(eui64));
5943 pos += sizeof(eui64);
5944 }
5945
5946 csi.hdr.nidt = NVME_NIDT_CSI;
5947 csi.hdr.nidl = NVME_NIDL_CSI;
5948 csi.v = ns->csi;
5949 memcpy(pos, &csi, sizeof(csi));
5950 pos += sizeof(csi);
5951
5952 return nvme_c2h(n, list, sizeof(list), req);
5953 }
5954
5955 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5956 {
5957 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5958 static const int data_len = sizeof(list);
5959
5960 trace_pci_nvme_identify_cmd_set();
5961
5962 NVME_SET_CSI(*list, NVME_CSI_NVM);
5963 NVME_SET_CSI(*list, NVME_CSI_ZONED);
5964
5965 return nvme_c2h(n, list, data_len, req);
5966 }
5967
5968 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5969 {
5970 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5971
5972 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5973 c->csi);
5974
5975 switch (c->cns) {
5976 case NVME_ID_CNS_NS:
5977 return nvme_identify_ns(n, req, true);
5978 case NVME_ID_CNS_NS_PRESENT:
5979 return nvme_identify_ns(n, req, false);
5980 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5981 return nvme_identify_ctrl_list(n, req, true);
5982 case NVME_ID_CNS_CTRL_LIST:
5983 return nvme_identify_ctrl_list(n, req, false);
5984 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5985 return nvme_identify_pri_ctrl_cap(n, req);
5986 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5987 return nvme_identify_sec_ctrl_list(n, req);
5988 case NVME_ID_CNS_CS_NS:
5989 return nvme_identify_ns_csi(n, req, true);
5990 case NVME_ID_CNS_CS_IND_NS:
5991 return nvme_identify_ns_ind(n, req, false);
5992 case NVME_ID_CNS_CS_IND_NS_ALLOCATED:
5993 return nvme_identify_ns_ind(n, req, true);
5994 case NVME_ID_CNS_CS_NS_PRESENT:
5995 return nvme_identify_ns_csi(n, req, false);
5996 case NVME_ID_CNS_CTRL:
5997 return nvme_identify_ctrl(n, req);
5998 case NVME_ID_CNS_CS_CTRL:
5999 return nvme_identify_ctrl_csi(n, req);
6000 case NVME_ID_CNS_NS_ACTIVE_LIST:
6001 return nvme_identify_nslist(n, req, true);
6002 case NVME_ID_CNS_NS_PRESENT_LIST:
6003 return nvme_identify_nslist(n, req, false);
6004 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
6005 return nvme_identify_nslist_csi(n, req, true);
6006 case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
6007 return nvme_endurance_group_list(n, req);
6008 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
6009 return nvme_identify_nslist_csi(n, req, false);
6010 case NVME_ID_CNS_NS_DESCR_LIST:
6011 return nvme_identify_ns_descr_list(n, req);
6012 case NVME_ID_CNS_IO_COMMAND_SET:
6013 return nvme_identify_cmd_set(n, req);
6014 default:
6015 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
6016 return NVME_INVALID_FIELD | NVME_DNR;
6017 }
6018 }
6019
6020 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
6021 {
6022 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
6023 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
6024 NvmeSQueue *sq = n->sq[sqid];
6025 NvmeRequest *r, *next;
6026 int i;
6027
6028 req->cqe.result = 1;
6029 if (nvme_check_sqid(n, sqid)) {
6030 return NVME_INVALID_FIELD | NVME_DNR;
6031 }
6032
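/*
 * Dword 0 of the completion reports bit 0 set if the command was not
 * aborted. For the admin queue, scan the outstanding AERs; a matching CID is
 * completed with Command Abort Requested and bit 0 is cleared. For I/O
 * queues, cancellation of a matching in-flight request is best effort.
 */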
6033 if (sqid == 0) {
6034 for (i = 0; i < n->outstanding_aers; i++) {
6035 NvmeRequest *re = n->aer_reqs[i];
6036 if (re->cqe.cid == cid) {
6037 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
6038 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
6039 n->outstanding_aers--;
6040 re->status = NVME_CMD_ABORT_REQ;
6041 req->cqe.result = 0;
6042 nvme_enqueue_req_completion(&n->admin_cq, re);
6043 return NVME_SUCCESS;
6044 }
6045 }
6046 }
6047
6048 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
6049 if (r->cqe.cid == cid) {
6050 if (r->aiocb) {
6051 blk_aio_cancel_async(r->aiocb);
6052 }
6053 break;
6054 }
6055 }
6056
6057 return NVME_SUCCESS;
6058 }
6059
6060 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6061 {
6062 trace_pci_nvme_setfeat_timestamp(ts);
6063
6064 n->host_timestamp = le64_to_cpu(ts);
6065 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6066 }
6067
6068 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6069 {
6070 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6071 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6072
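/*
 * The reported timestamp is the last host-set value advanced by the number
 * of virtual-clock milliseconds elapsed since it was set.
 */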
6073 union nvme_timestamp {
6074 struct {
6075 uint64_t timestamp:48;
6076 uint64_t sync:1;
6077 uint64_t origin:3;
6078 uint64_t rsvd1:12;
6079 };
6080 uint64_t all;
6081 };
6082
6083 union nvme_timestamp ts;
6084 ts.all = 0;
6085 ts.timestamp = n->host_timestamp + elapsed_time;
6086
6087 /* If the host timestamp is non-zero, set the timestamp origin */
6088 ts.origin = n->host_timestamp ? 0x01 : 0x00;
6089
6090 trace_pci_nvme_getfeat_timestamp(ts.all);
6091
6092 return cpu_to_le64(ts.all);
6093 }
6094
6095 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6096 {
6097 uint64_t timestamp = nvme_get_timestamp(n);
6098
6099 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6100 }
6101
6102 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6103 uint32_t *result)
6104 {
6105 *result = 0;
6106
6107 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6108 return NVME_INVALID_FIELD | NVME_DNR;
6109 }
6110
6111 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6112 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6113
6114 return NVME_SUCCESS;
6115 }
6116
6117 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6118 NvmeRequest *req, uint32_t *result)
6119 {
6120 NvmeCmd *cmd = &req->cmd;
6121 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6122 uint16_t ph = cdw11 & 0xffff;
6123 uint8_t noet = (cdw11 >> 16) & 0xff;
6124 uint16_t ruhid, ret;
6125 uint32_t nentries = 0;
6126 uint8_t s_events_ndx = 0;
6127 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6128 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6129 NvmeRuHandle *ruh;
6130 NvmeFdpEventDescr *s_event;
6131
6132 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6133 return NVME_FDP_DISABLED | NVME_DNR;
6134 }
6135
6136 if (!nvme_ph_valid(ns, ph)) {
6137 return NVME_INVALID_FIELD | NVME_DNR;
6138 }
6139
6140 ruhid = ns->fdp.phs[ph];
6141 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6142
6143 assert(ruh);
6144
6145 if (unlikely(noet == 0)) {
6146 return NVME_INVALID_FIELD | NVME_DNR;
6147 }
6148
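/*
 * Build one event descriptor per supported FDP event type, reporting whether
 * the event is currently enabled in the reclaim unit handle's event filter,
 * up to the number of entries (noet) requested by the host.
 */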
6149 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6150 uint8_t shift = nvme_fdp_evf_shifts[event_type];
6151 if (!shift && event_type) {
6152 /*
6153 * Only the first entry (event_type == 0) has a shift value of 0;
6154 * other entries with a zero shift are simply unpopulated.
6155 */
6156 continue;
6157 }
6158
6159 nentries++;
6160
6161 s_event = &s_events[s_events_ndx];
6162 s_event->evt = event_type;
6163 s_event->evta = (ruh->event_filter >> shift) & 0x1;
6164
6165 /* break if all `noet` entries are filled */
6166 if ((++s_events_ndx) == noet) {
6167 break;
6168 }
6169 }
6170
6171 ret = nvme_c2h(n, s_events, s_events_siz, req);
6172 if (ret) {
6173 return ret;
6174 }
6175
6176 *result = nentries;
6177 return NVME_SUCCESS;
6178 }
6179
6180 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6181 {
6182 NvmeCmd *cmd = &req->cmd;
6183 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6184 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6185 uint32_t nsid = le32_to_cpu(cmd->nsid);
6186 uint32_t result = 0;
6187 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6188 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6189 uint16_t iv;
6190 NvmeNamespace *ns;
6191 int i;
6192 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6193
6194 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6195 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6196 };
6197
6198 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6199
6200 if (!nvme_feature_support[fid]) {
6201 return NVME_INVALID_FIELD | NVME_DNR;
6202 }
6203
6204 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6205 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6206 /*
6207 * The Reservation Notification Mask and Reservation Persistence
6208 * features require a status code of Invalid Field in Command when
6209 * NSID is FFFFFFFFh. Since the device does not support those
6210 * features we can always return Invalid Namespace or Format as we
6211 * should do for all other features.
6212 */
6213 return NVME_INVALID_NSID | NVME_DNR;
6214 }
6215
6216 if (!nvme_ns(n, nsid)) {
6217 return NVME_INVALID_FIELD | NVME_DNR;
6218 }
6219 }
6220
6221 switch (sel) {
6222 case NVME_GETFEAT_SELECT_CURRENT:
6223 break;
6224 case NVME_GETFEAT_SELECT_SAVED:
6225 /* no features are saveable by the controller; fallthrough */
6226 case NVME_GETFEAT_SELECT_DEFAULT:
6227 goto defaults;
6228 case NVME_GETFEAT_SELECT_CAP:
6229 result = nvme_feature_cap[fid];
6230 goto out;
6231 }
6232
6233 switch (fid) {
6234 case NVME_TEMPERATURE_THRESHOLD:
6235 result = 0;
6236
6237 /*
6238 * The controller only implements the Composite Temperature sensor, so
6239 * return 0 for all other sensors.
6240 */
6241 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6242 goto out;
6243 }
6244
6245 switch (NVME_TEMP_THSEL(dw11)) {
6246 case NVME_TEMP_THSEL_OVER:
6247 result = n->features.temp_thresh_hi;
6248 goto out;
6249 case NVME_TEMP_THSEL_UNDER:
6250 result = n->features.temp_thresh_low;
6251 goto out;
6252 }
6253
6254 return NVME_INVALID_FIELD | NVME_DNR;
6255 case NVME_ERROR_RECOVERY:
6256 if (!nvme_nsid_valid(n, nsid)) {
6257 return NVME_INVALID_NSID | NVME_DNR;
6258 }
6259
6260 ns = nvme_ns(n, nsid);
6261 if (unlikely(!ns)) {
6262 return NVME_INVALID_FIELD | NVME_DNR;
6263 }
6264
6265 result = ns->features.err_rec;
6266 goto out;
6267 case NVME_VOLATILE_WRITE_CACHE:
6268 result = 0;
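/*
 * Report the cache as enabled if write caching is enabled on any attached
 * namespace's backing block device.
 */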
6269 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6270 ns = nvme_ns(n, i);
6271 if (!ns) {
6272 continue;
6273 }
6274
6275 result = blk_enable_write_cache(ns->blkconf.blk);
6276 if (result) {
6277 break;
6278 }
6279 }
6280 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6281 goto out;
6282 case NVME_ASYNCHRONOUS_EVENT_CONF:
6283 result = n->features.async_config;
6284 goto out;
6285 case NVME_TIMESTAMP:
6286 return nvme_get_feature_timestamp(n, req);
6287 case NVME_HOST_BEHAVIOR_SUPPORT:
6288 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6289 sizeof(n->features.hbs), req);
6290 case NVME_FDP_MODE:
6291 endgrpid = dw11 & 0xff;
6292
6293 if (endgrpid != 0x1) {
6294 return NVME_INVALID_FIELD | NVME_DNR;
6295 }
6296
6297 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6298 if (ret) {
6299 return ret;
6300 }
6301 goto out;
6302 case NVME_FDP_EVENTS:
6303 if (!nvme_nsid_valid(n, nsid)) {
6304 return NVME_INVALID_NSID | NVME_DNR;
6305 }
6306
6307 ns = nvme_ns(n, nsid);
6308 if (unlikely(!ns)) {
6309 return NVME_INVALID_FIELD | NVME_DNR;
6310 }
6311
6312 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6313 if (ret) {
6314 return ret;
6315 }
6316 goto out;
6317 default:
6318 break;
6319 }
6320
6321 defaults:
6322 switch (fid) {
6323 case NVME_TEMPERATURE_THRESHOLD:
6324 result = 0;
6325
6326 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6327 break;
6328 }
6329
6330 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6331 result = NVME_TEMPERATURE_WARNING;
6332 }
6333
6334 break;
6335 case NVME_NUMBER_OF_QUEUES:
6336 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6337 trace_pci_nvme_getfeat_numq(result);
6338 break;
6339 case NVME_INTERRUPT_VECTOR_CONF:
6340 iv = dw11 & 0xffff;
6341 if (iv >= n->conf_ioqpairs + 1) {
6342 return NVME_INVALID_FIELD | NVME_DNR;
6343 }
6344
6345 result = iv;
6346 if (iv == n->admin_cq.vector) {
6347 result |= NVME_INTVC_NOCOALESCING;
6348 }
6349 break;
6350 case NVME_FDP_MODE:
6351 endgrpid = dw11 & 0xff;
6352
6353 if (endgrpid != 0x1) {
6354 return NVME_INVALID_FIELD | NVME_DNR;
6355 }
6356
6357 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6358 if (ret) {
6359 return ret;
6360 }
6361 break;
6362
6363 case NVME_WRITE_ATOMICITY:
6364 result = n->dn;
6365 break;
6366 default:
6367 result = nvme_feature_default[fid];
6368 break;
6369 }
6370
6371 out:
6372 req->cqe.result = cpu_to_le32(result);
6373 return ret;
6374 }
6375
6376 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6377 {
6378 uint16_t ret;
6379 uint64_t timestamp;
6380
6381 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6382 if (ret) {
6383 return ret;
6384 }
6385
6386 nvme_set_timestamp(n, timestamp);
6387
6388 return NVME_SUCCESS;
6389 }
6390
6391 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6392 NvmeRequest *req)
6393 {
6394 NvmeCmd *cmd = &req->cmd;
6395 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6396 uint16_t ph = cdw11 & 0xffff;
6397 uint8_t noet = (cdw11 >> 16) & 0xff;
6398 uint16_t ret, ruhid;
6399 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6400 uint8_t event_mask = 0;
6401 unsigned int i;
6402 g_autofree uint8_t *events = g_malloc0(noet);
6403 NvmeRuHandle *ruh = NULL;
6404
6405 assert(ns);
6406
6407 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6408 return NVME_FDP_DISABLED | NVME_DNR;
6409 }
6410
6411 if (!nvme_ph_valid(ns, ph)) {
6412 return NVME_INVALID_FIELD | NVME_DNR;
6413 }
6414
6415 ruhid = ns->fdp.phs[ph];
6416 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6417
6418 ret = nvme_h2c(n, events, noet, req);
6419 if (ret) {
6420 return ret;
6421 }
6422
6423 for (i = 0; i < noet; i++) {
6424 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6425 }
6426
6427 if (enable) {
6428 ruh->event_filter |= event_mask;
6429 } else {
6430 ruh->event_filter = ruh->event_filter & ~event_mask;
6431 }
6432
6433 return NVME_SUCCESS;
6434 }
6435
6436 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6437 {
6438 NvmeNamespace *ns = NULL;
6439
6440 NvmeCmd *cmd = &req->cmd;
6441 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6442 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6443 uint32_t nsid = le32_to_cpu(cmd->nsid);
6444 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6445 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6446 uint16_t status;
6447 int i;
6448 NvmeIdCtrl *id = &n->id_ctrl;
6449 NvmeAtomic *atomic = &n->atomic;
6450
6451 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6452
6453 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6454 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6455 }
6456
6457 if (!nvme_feature_support[fid]) {
6458 return NVME_INVALID_FIELD | NVME_DNR;
6459 }
6460
6461 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6462 if (nsid != NVME_NSID_BROADCAST) {
6463 if (!nvme_nsid_valid(n, nsid)) {
6464 return NVME_INVALID_NSID | NVME_DNR;
6465 }
6466
6467 ns = nvme_ns(n, nsid);
6468 if (unlikely(!ns)) {
6469 return NVME_INVALID_FIELD | NVME_DNR;
6470 }
6471 }
6472 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6473 if (!nvme_nsid_valid(n, nsid)) {
6474 return NVME_INVALID_NSID | NVME_DNR;
6475 }
6476
6477 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6478 }
6479
6480 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6481 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6482 }
6483
6484 switch (fid) {
6485 case NVME_TEMPERATURE_THRESHOLD:
6486 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6487 break;
6488 }
6489
6490 switch (NVME_TEMP_THSEL(dw11)) {
6491 case NVME_TEMP_THSEL_OVER:
6492 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6493 break;
6494 case NVME_TEMP_THSEL_UNDER:
6495 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6496 break;
6497 default:
6498 return NVME_INVALID_FIELD | NVME_DNR;
6499 }
6500
6501 if ((n->temperature >= n->features.temp_thresh_hi) ||
6502 (n->temperature <= n->features.temp_thresh_low)) {
6503 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6504 }
6505
6506 break;
6507 case NVME_ERROR_RECOVERY:
6508 if (nsid == NVME_NSID_BROADCAST) {
6509 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6510 ns = nvme_ns(n, i);
6511
6512 if (!ns) {
6513 continue;
6514 }
6515
6516 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6517 ns->features.err_rec = dw11;
6518 }
6519 }
6520
6521 break;
6522 }
6523
6524 assert(ns);
6525 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6526 ns->features.err_rec = dw11;
6527 }
6528 break;
6529 case NVME_VOLATILE_WRITE_CACHE:
6530 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6531 ns = nvme_ns(n, i);
6532 if (!ns) {
6533 continue;
6534 }
6535
6536 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6537 blk_flush(ns->blkconf.blk);
6538 }
6539
6540 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6541 }
6542
6543 break;
6544
6545 case NVME_NUMBER_OF_QUEUES:
6546 if (n->qs_created) {
6547 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6548 }
6549
6550 /*
6551 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6552 * and NSQR.
6553 */
6554 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6555 return NVME_INVALID_FIELD | NVME_DNR;
6556 }
6557
6558 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6559 ((dw11 >> 16) & 0xffff) + 1,
6560 n->conf_ioqpairs,
6561 n->conf_ioqpairs);
6562 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6563 ((n->conf_ioqpairs - 1) << 16));
6564 break;
6565 case NVME_ASYNCHRONOUS_EVENT_CONF:
6566 n->features.async_config = dw11;
6567 break;
6568 case NVME_TIMESTAMP:
6569 return nvme_set_feature_timestamp(n, req);
6570 case NVME_HOST_BEHAVIOR_SUPPORT:
6571 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6572 sizeof(n->features.hbs), req);
6573 if (status) {
6574 return status;
6575 }
6576
6577 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6578 ns = nvme_ns(n, i);
6579
6580 if (!ns) {
6581 continue;
6582 }
6583
6584 ns->id_ns.nlbaf = ns->nlbaf - 1;
6585 if (!n->features.hbs.lbafee) {
6586 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6587 }
6588 }
6589
6590 return status;
6591 case NVME_COMMAND_SET_PROFILE:
6592 if (dw11 & 0x1ff) {
6593 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6594 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
6595 }
6596 break;
6597 case NVME_FDP_MODE:
6598 /* spec: abort with Command Sequence Error if the endurance group has one or more namespaces */
6599 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6600 case NVME_FDP_EVENTS:
6601 return nvme_set_feature_fdp_events(n, ns, req);
6602 case NVME_WRITE_ATOMICITY:
6603
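/*
 * Bit 0 (DN, Disable Normal) selects whether AWUPF or AWUN bounds the
 * maximum atomic write size; a limit of a single block disables atomic
 * write tracking altogether.
 */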
6604 n->dn = 0x1 & dw11;
6605
6606 if (n->dn) {
6607 atomic->atomic_max_write_size = le16_to_cpu(id->awupf) + 1;
6608 } else {
6609 atomic->atomic_max_write_size = le16_to_cpu(id->awun) + 1;
6610 }
6611
6612 if (atomic->atomic_max_write_size == 1) {
6613 atomic->atomic_writes = 0;
6614 } else {
6615 atomic->atomic_writes = 1;
6616 }
6617 break;
6618 default:
6619 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6620 }
6621 return NVME_SUCCESS;
6622 }
6623
6624 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6625 {
6626 trace_pci_nvme_aer(nvme_cid(req));
6627
6628 if (n->outstanding_aers > n->params.aerl) {
6629 trace_pci_nvme_aer_aerl_exceeded();
6630 return NVME_AER_LIMIT_EXCEEDED;
6631 }
6632
6633 n->aer_reqs[n->outstanding_aers] = req;
6634 n->outstanding_aers++;
6635
6636 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6637 nvme_process_aers(n);
6638 }
6639
6640 return NVME_NO_COMPLETE;
6641 }
6642
6643 static void nvme_update_dmrsl(NvmeCtrl *n)
6644 {
6645 int nsid;
6646
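/*
 * Recompute the controller-wide DMRSL as the smallest non-zero per-namespace
 * limit, derived from the largest request the block layer accepts divided by
 * the namespace block size.
 */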
6647 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6648 NvmeNamespace *ns = nvme_ns(n, nsid);
6649 if (!ns) {
6650 continue;
6651 }
6652
6653 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6654 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6655 }
6656 }
6657
6658 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
6659 {
6660 uint32_t cc = ldl_le_p(&n->bar.cc);
6661
6662 ns->iocs = nvme_cse_iocs_none;
6663 switch (ns->csi) {
6664 case NVME_CSI_NVM:
6665 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
6666 ns->iocs = nvme_cse_iocs_nvm;
6667 }
6668 break;
6669 case NVME_CSI_ZONED:
6670 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
6671 ns->iocs = nvme_cse_iocs_zoned;
6672 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
6673 ns->iocs = nvme_cse_iocs_nvm;
6674 }
6675 break;
6676 }
6677 }
6678
6679 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6680 {
6681 NvmeNamespace *ns;
6682 NvmeCtrl *ctrl;
6683 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6684 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6685 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6686 uint8_t sel = dw10 & 0xf;
6687 uint16_t *nr_ids = &list[0];
6688 uint16_t *ids = &list[1];
6689 uint16_t ret;
6690 int i;
6691
6692 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6693
6694 if (!nvme_nsid_valid(n, nsid)) {
6695 return NVME_INVALID_NSID | NVME_DNR;
6696 }
6697
6698 ns = nvme_subsys_ns(n->subsys, nsid);
6699 if (!ns) {
6700 return NVME_INVALID_FIELD | NVME_DNR;
6701 }
6702
6703 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6704 if (ret) {
6705 return ret;
6706 }
6707
6708 if (!*nr_ids) {
6709 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6710 }
6711
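/*
 * Walk the controller list supplied by the host and attach or detach the
 * namespace on each referenced controller, queueing a Namespace Attribute
 * Changed event for each controller that does not already have this
 * namespace in its changed-namespace list.
 */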
6712 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6713 for (i = 0; i < *nr_ids; i++) {
6714 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6715 if (!ctrl) {
6716 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6717 }
6718
6719 switch (sel) {
6720 case NVME_NS_ATTACHMENT_ATTACH:
6721 if (nvme_ns(ctrl, nsid)) {
6722 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6723 }
6724
6725 if (ns->attached && !ns->params.shared) {
6726 return NVME_NS_PRIVATE | NVME_DNR;
6727 }
6728
6729 nvme_attach_ns(ctrl, ns);
6730 nvme_select_iocs_ns(ctrl, ns);
6731
6732 break;
6733
6734 case NVME_NS_ATTACHMENT_DETACH:
6735 if (!nvme_ns(ctrl, nsid)) {
6736 return NVME_NS_NOT_ATTACHED | NVME_DNR;
6737 }
6738
6739 ctrl->namespaces[nsid] = NULL;
6740 ns->attached--;
6741
6742 nvme_update_dmrsl(ctrl);
6743
6744 break;
6745
6746 default:
6747 return NVME_INVALID_FIELD | NVME_DNR;
6748 }
6749
6750 /*
6751 * Add namespace id to the changed namespace id list for event clearing
6752 * via Get Log Page command.
6753 */
6754 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6755 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6756 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6757 NVME_LOG_CHANGED_NSLIST);
6758 }
6759 }
6760
6761 return NVME_SUCCESS;
6762 }
6763
6764 typedef struct NvmeFormatAIOCB {
6765 BlockAIOCB common;
6766 BlockAIOCB *aiocb;
6767 NvmeRequest *req;
6768 int ret;
6769
6770 NvmeNamespace *ns;
6771 uint32_t nsid;
6772 bool broadcast;
6773 int64_t offset;
6774
6775 uint8_t lbaf;
6776 uint8_t mset;
6777 uint8_t pi;
6778 uint8_t pil;
6779 } NvmeFormatAIOCB;
6780
6781 static void nvme_format_cancel(BlockAIOCB *aiocb)
6782 {
6783 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6784
6785 iocb->ret = -ECANCELED;
6786
6787 if (iocb->aiocb) {
6788 blk_aio_cancel_async(iocb->aiocb);
6789 iocb->aiocb = NULL;
6790 }
6791 }
6792
6793 static const AIOCBInfo nvme_format_aiocb_info = {
6794 .aiocb_size = sizeof(NvmeFormatAIOCB),
6795 .cancel_async = nvme_format_cancel,
6796 };
6797
6798 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6799 uint8_t pi, uint8_t pil)
6800 {
6801 uint8_t lbafl = lbaf & 0xf;
6802 uint8_t lbafu = lbaf >> 4;
6803
6804 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6805
6806 ns->id_ns.dps = (pil << 3) | pi;
6807 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6808
6809 nvme_ns_init_format(ns);
6810 }
6811
6812 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6813
6814 static void nvme_format_ns_cb(void *opaque, int ret)
6815 {
6816 NvmeFormatAIOCB *iocb = opaque;
6817 NvmeNamespace *ns = iocb->ns;
6818 int bytes;
6819
6820 if (iocb->ret < 0) {
6821 goto done;
6822 } else if (ret < 0) {
6823 iocb->ret = ret;
6824 goto done;
6825 }
6826
6827 assert(ns);
6828
6829 if (iocb->offset < ns->size) {
6830 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6831
6832 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6833 bytes, BDRV_REQ_MAY_UNMAP,
6834 nvme_format_ns_cb, iocb);
6835
6836 iocb->offset += bytes;
6837 return;
6838 }
6839
6840 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6841 ns->status = 0x0;
6842 iocb->ns = NULL;
6843 iocb->offset = 0;
6844
6845 done:
6846 nvme_do_format(iocb);
6847 }
6848
6849 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6850 {
6851 if (ns->params.zoned) {
6852 return NVME_INVALID_FORMAT | NVME_DNR;
6853 }
6854
6855 if (lbaf > ns->id_ns.nlbaf) {
6856 return NVME_INVALID_FORMAT | NVME_DNR;
6857 }
6858
6859 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6860 return NVME_INVALID_FORMAT | NVME_DNR;
6861 }
6862
6863 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6864 return NVME_INVALID_FIELD | NVME_DNR;
6865 }
6866
6867 return NVME_SUCCESS;
6868 }
6869
6870 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6871 {
6872 NvmeRequest *req = iocb->req;
6873 NvmeCtrl *n = nvme_ctrl(req);
6874 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6875 uint8_t lbaf = dw10 & 0xf;
6876 uint8_t pi = (dw10 >> 5) & 0x7;
6877 uint16_t status;
6878 int i;
6879
6880 if (iocb->ret < 0) {
6881 goto done;
6882 }
6883
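/*
 * For a broadcast format, advance to the next attached namespace on each
 * pass; namespaces are formatted one at a time, with the per-namespace
 * completion callback re-entering this function.
 */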
6884 if (iocb->broadcast) {
6885 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6886 iocb->ns = nvme_ns(n, i);
6887 if (iocb->ns) {
6888 iocb->nsid = i;
6889 break;
6890 }
6891 }
6892 }
6893
6894 if (!iocb->ns) {
6895 goto done;
6896 }
6897
6898 status = nvme_format_check(iocb->ns, lbaf, pi);
6899 if (status) {
6900 req->status = status;
6901 goto done;
6902 }
6903
6904 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
6905 nvme_format_ns_cb(iocb, 0);
6906 return;
6907
6908 done:
6909 iocb->common.cb(iocb->common.opaque, iocb->ret);
6910 qemu_aio_unref(iocb);
6911 }
6912
6913 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
6914 {
6915 NvmeFormatAIOCB *iocb;
6916 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6917 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6918 uint8_t lbaf = dw10 & 0xf;
6919 uint8_t mset = (dw10 >> 4) & 0x1;
6920 uint8_t pi = (dw10 >> 5) & 0x7;
6921 uint8_t pil = (dw10 >> 8) & 0x1;
6922 uint8_t lbafu = (dw10 >> 12) & 0x3;
6923 uint16_t status;
6924
6925 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
6926
6927 iocb->req = req;
6928 iocb->ret = 0;
6929 iocb->ns = NULL;
6930 iocb->nsid = 0;
6931 iocb->lbaf = lbaf;
6932 iocb->mset = mset;
6933 iocb->pi = pi;
6934 iocb->pil = pil;
6935 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
6936 iocb->offset = 0;
6937
6938 if (n->features.hbs.lbafee) {
6939 iocb->lbaf |= lbafu << 4;
6940 }
6941
6942 if (!iocb->broadcast) {
6943 if (!nvme_nsid_valid(n, nsid)) {
6944 status = NVME_INVALID_NSID | NVME_DNR;
6945 goto out;
6946 }
6947
6948 iocb->ns = nvme_ns(n, nsid);
6949 if (!iocb->ns) {
6950 status = NVME_INVALID_FIELD | NVME_DNR;
6951 goto out;
6952 }
6953 }
6954
6955 req->aiocb = &iocb->common;
6956 nvme_do_format(iocb);
6957
6958 return NVME_NO_COMPLETE;
6959
6960 out:
6961 qemu_aio_unref(iocb);
6962
6963 return status;
6964 }
6965
6966 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
6967 int *num_prim, int *num_sec)
6968 {
6969 *num_total = le32_to_cpu(rt ?
6970 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
6971 *num_prim = le16_to_cpu(rt ?
6972 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
6973 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
6974 }
6975
6976 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
6977 uint16_t cntlid, uint8_t rt,
6978 int nr)
6979 {
6980 int num_total, num_prim, num_sec;
6981
6982 if (cntlid != n->cntlid) {
6983 return NVME_INVALID_CTRL_ID | NVME_DNR;
6984 }
6985
6986 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6987
6988 if (nr > num_total) {
6989 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6990 }
6991
6992 if (nr > num_total - num_sec) {
6993 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6994 }
6995
6996 if (rt) {
6997 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
6998 } else {
6999 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
7000 }
7001
7002 req->cqe.result = cpu_to_le32(nr);
7003 return req->status;
7004 }
7005
7006 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
7007 uint8_t rt, int nr)
7008 {
7009 int prev_nr, prev_total;
7010
7011 if (rt) {
7012 prev_nr = le16_to_cpu(sctrl->nvi);
7013 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
7014 sctrl->nvi = cpu_to_le16(nr);
7015 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
7016 } else {
7017 prev_nr = le16_to_cpu(sctrl->nvq);
7018 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
7019 sctrl->nvq = cpu_to_le16(nr);
7020 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
7021 }
7022 }
7023
7024 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
7025 uint16_t cntlid, uint8_t rt, int nr)
7026 {
7027 int num_total, num_prim, num_sec, num_free, diff, limit;
7028 NvmeSecCtrlEntry *sctrl;
7029
7030 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7031 if (!sctrl) {
7032 return NVME_INVALID_CTRL_ID | NVME_DNR;
7033 }
7034
7035 if (sctrl->scs) {
7036 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7037 }
7038
7039 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
7040 if (nr > limit) {
7041 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7042 }
7043
7044 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7045 num_free = num_total - num_prim - num_sec;
7046 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
7047
7048 if (diff > num_free) {
7049 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7050 }
7051
7052 nvme_update_virt_res(n, sctrl, rt, nr);
7053 req->cqe.result = cpu_to_le32(nr);
7054
7055 return req->status;
7056 }
7057
7058 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
7059 {
7060 PCIDevice *pci = PCI_DEVICE(n);
7061 NvmeCtrl *sn = NULL;
7062 NvmeSecCtrlEntry *sctrl;
7063 int vf_index;
7064
7065 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7066 if (!sctrl) {
7067 return NVME_INVALID_CTRL_ID | NVME_DNR;
7068 }
7069
7070 if (!pci_is_vf(pci)) {
7071 vf_index = le16_to_cpu(sctrl->vfn) - 1;
7072 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
7073 }
7074
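/*
 * Bringing a secondary controller online requires at least one VI and two VQ
 * flexible resources to be assigned; taking it offline releases all assigned
 * flexible resources and resets the corresponding VF controller.
 */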
7075 if (online) {
7076 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
7077 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7078 }
7079
7080 if (!sctrl->scs) {
7081 sctrl->scs = 0x1;
7082 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7083 }
7084 } else {
7085 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7086 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7087
7088 if (sctrl->scs) {
7089 sctrl->scs = 0x0;
7090 if (sn) {
7091 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7092 }
7093 }
7094 }
7095
7096 return NVME_SUCCESS;
7097 }
7098
7099 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7100 {
7101 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7102 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7103 uint8_t act = dw10 & 0xf;
7104 uint8_t rt = (dw10 >> 8) & 0x7;
7105 uint16_t cntlid = (dw10 >> 16) & 0xffff;
7106 int nr = dw11 & 0xffff;
7107
7108 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7109
7110 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7111 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7112 }
7113
7114 switch (act) {
7115 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7116 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7117 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7118 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7119 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7120 return nvme_virt_set_state(n, cntlid, true);
7121 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7122 return nvme_virt_set_state(n, cntlid, false);
7123 default:
7124 return NVME_INVALID_FIELD | NVME_DNR;
7125 }
7126 }
7127
7128 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7129 {
7130 PCIDevice *pci = PCI_DEVICE(n);
7131 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7132 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7133 int i;
7134
7135 /* Address should be page aligned */
7136 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7137 return NVME_INVALID_FIELD | NVME_DNR;
7138 }
7139
7140 /* Save shadow buffer base addr for use during queue creation */
7141 n->dbbuf_dbs = dbs_addr;
7142 n->dbbuf_eis = eis_addr;
7143 n->dbbuf_enabled = true;
7144
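/*
 * Point every existing queue at its slot in the shadow doorbell and EventIdx
 * buffers, seed the shadow values with the current tail/head, and enable
 * ioeventfd acceleration for I/O queues where possible.
 */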
7145 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7146 NvmeSQueue *sq = n->sq[i];
7147 NvmeCQueue *cq = n->cq[i];
7148
7149 if (sq) {
7150 /*
7151 * CAP.DSTRD is 0, so the offset of the ith SQ doorbell (db_addr) is
7152 * (i << 3). nvme_process_db() uses this hard-coded way to calculate
7153 * doorbell offsets. Be consistent with that here.
7154 */
7155 sq->db_addr = dbs_addr + (i << 3);
7156 sq->ei_addr = eis_addr + (i << 3);
7157 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7158
7159 if (n->params.ioeventfd && sq->sqid != 0) {
7160 if (!nvme_init_sq_ioeventfd(sq)) {
7161 sq->ioeventfd_enabled = true;
7162 }
7163 }
7164 }
7165
7166 if (cq) {
7167 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7168 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7169 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7170 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7171
7172 if (n->params.ioeventfd && cq->cqid != 0) {
7173 if (!nvme_init_cq_ioeventfd(cq)) {
7174 cq->ioeventfd_enabled = true;
7175 }
7176 }
7177 }
7178 }
7179
7180 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7181
7182 return NVME_SUCCESS;
7183 }
7184
7185 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7186 {
7187 return NVME_INVALID_FIELD | NVME_DNR;
7188 }
7189
7190 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7191 {
7192 NvmeNamespace *ns;
7193 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7194 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7195 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7196 uint8_t doper, dtype;
7197 uint32_t numd, trans_len;
7198 NvmeDirectiveIdentify id = {
7199 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7200 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7201 };
7202
7203 numd = dw10 + 1;
7204 doper = dw11 & 0xff;
7205 dtype = (dw11 >> 8) & 0xff;
7206
7207 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7208
7209 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7210 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7211 return NVME_INVALID_FIELD | NVME_DNR;
7212 }
7213
7214 ns = nvme_ns(n, nsid);
7215 if (!ns) {
7216 return NVME_INVALID_FIELD | NVME_DNR;
7217 }
7218
7219 switch (dtype) {
7220 case NVME_DIRECTIVE_IDENTIFY:
7221 switch (doper) {
7222 case NVME_DIRECTIVE_RETURN_PARAMS:
7223 if (ns->endgrp && ns->endgrp->fdp.enabled) {
7224 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7225 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7226 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7227 }
7228
7229 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7230
7231 default:
7232 return NVME_INVALID_FIELD | NVME_DNR;
7233 }
7234
7235 default:
7236 return NVME_INVALID_FIELD;
7237 }
7238 }
7239
7240 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7241 {
7242 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7243 nvme_adm_opc_str(req->cmd.opcode));
7244
7245 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7246 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7247 return NVME_INVALID_OPCODE | NVME_DNR;
7248 }
7249
7250 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7251 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7252 return NVME_INVALID_FIELD | NVME_DNR;
7253 }
7254
7255 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7256 return NVME_INVALID_FIELD;
7257 }
7258
7259 switch (req->cmd.opcode) {
7260 case NVME_ADM_CMD_DELETE_SQ:
7261 return nvme_del_sq(n, req);
7262 case NVME_ADM_CMD_CREATE_SQ:
7263 return nvme_create_sq(n, req);
7264 case NVME_ADM_CMD_GET_LOG_PAGE:
7265 return nvme_get_log(n, req);
7266 case NVME_ADM_CMD_DELETE_CQ:
7267 return nvme_del_cq(n, req);
7268 case NVME_ADM_CMD_CREATE_CQ:
7269 return nvme_create_cq(n, req);
7270 case NVME_ADM_CMD_IDENTIFY:
7271 return nvme_identify(n, req);
7272 case NVME_ADM_CMD_ABORT:
7273 return nvme_abort(n, req);
7274 case NVME_ADM_CMD_SET_FEATURES:
7275 return nvme_set_feature(n, req);
7276 case NVME_ADM_CMD_GET_FEATURES:
7277 return nvme_get_feature(n, req);
7278 case NVME_ADM_CMD_ASYNC_EV_REQ:
7279 return nvme_aer(n, req);
7280 case NVME_ADM_CMD_NS_ATTACHMENT:
7281 return nvme_ns_attachment(n, req);
7282 case NVME_ADM_CMD_VIRT_MNGMT:
7283 return nvme_virt_mngmt(n, req);
7284 case NVME_ADM_CMD_DBBUF_CONFIG:
7285 return nvme_dbbuf_config(n, req);
7286 case NVME_ADM_CMD_FORMAT_NVM:
7287 return nvme_format(n, req);
7288 case NVME_ADM_CMD_DIRECTIVE_SEND:
7289 return nvme_directive_send(n, req);
7290 case NVME_ADM_CMD_DIRECTIVE_RECV:
7291 return nvme_directive_receive(n, req);
7292 default:
7293 g_assert_not_reached();
7294 }
7295
7296 return NVME_INVALID_OPCODE | NVME_DNR;
7297 }
7298
7299 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7300 {
7301 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7302
7303 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7304 MEMTXATTRS_UNSPECIFIED);
7305 }
7306
7307 static void nvme_update_sq_tail(NvmeSQueue *sq)
7308 {
7309 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7310 MEMTXATTRS_UNSPECIFIED);
7311
7312 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7313 }
7314
7315 #define NVME_ATOMIC_NO_START 0
7316 #define NVME_ATOMIC_START_ATOMIC 1
7317 #define NVME_ATOMIC_START_NONATOMIC 2
7318
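/*
 * Decide whether a command may start now: an atomic write must not overlap
 * any outstanding read or write on the same namespace, and a non-atomic
 * command must not overlap an in-flight atomic write. When a conflict is
 * found the caller reschedules the submission queue and retries later.
 */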
7319 static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd,
7320 NvmeAtomic *atomic)
7321 {
7322 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
7323 uint64_t slba = le64_to_cpu(rw->slba);
7324 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
7325 uint64_t elba = slba + nlb;
7326 bool cmd_atomic_wr = true;
7327 int i;
7328
7329 if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
7330 ((rw->nlb + 1) > atomic->atomic_max_write_size))) {
7331 cmd_atomic_wr = false;
7332 }
7333
7334 /*
7335 * Walk the queues to see if there are any atomic conflicts.
7336 */
7337 for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
7338 NvmeSQueue *sq;
7339 NvmeRequest *req;
7340 NvmeRwCmd *req_rw;
7341 uint64_t req_slba;
7342 uint32_t req_nlb;
7343 uint64_t req_elba;
7344
7345 sq = n->sq[i];
7346 if (!sq) {
7347 continue;
7348 }
7349
7350 /*
7351 * Walk all the requests on a given queue.
7352 */
7353 QTAILQ_FOREACH(req, &sq->out_req_list, entry) {
7354 req_rw = (NvmeRwCmd *)&req->cmd;
7355
7356 if (((req_rw->opcode == NVME_CMD_WRITE) ||
7357 (req_rw->opcode == NVME_CMD_READ)) &&
7358 (cmd->nsid == req->ns->params.nsid)) {
7359 req_slba = le64_to_cpu(req_rw->slba);
7360 req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
7361 req_elba = req_slba + req_nlb;
7362
7363 if (cmd_atomic_wr) {
7364 if ((elba >= req_slba) && (slba <= req_elba)) {
7365 return NVME_ATOMIC_NO_START;
7366 }
7367 } else {
7368 if (req->atomic_write && ((elba >= req_slba) &&
7369 (slba <= req_elba))) {
7370 return NVME_ATOMIC_NO_START;
7371 }
7372 }
7373 }
7374 }
7375 }
7376 if (cmd_atomic_wr) {
7377 return NVME_ATOMIC_START_ATOMIC;
7378 }
7379 return NVME_ATOMIC_START_NONATOMIC;
7380 }
7381
7382 static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd)
7383 {
7384 if (n->atomic.atomic_writes) {
7385 return &n->atomic;
7386 }
7387 return NULL;
7388 }
7389
7390 static void nvme_process_sq(void *opaque)
7391 {
7392 NvmeSQueue *sq = opaque;
7393 NvmeCtrl *n = sq->ctrl;
7394 NvmeCQueue *cq = n->cq[sq->cqid];
7395
7396 uint16_t status;
7397 hwaddr addr;
7398 NvmeCmd cmd;
7399 NvmeRequest *req;
7400
7401 if (n->dbbuf_enabled) {
7402 nvme_update_sq_tail(sq);
7403 }
7404
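/*
 * Drain the submission queue: fetch each command, run the atomic-write
 * admission check for I/O queues, then dispatch to the admin or I/O command
 * handler and post a completion unless the command completes asynchronously.
 */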
7405 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7406 NvmeAtomic *atomic;
7407 bool cmd_is_atomic;
7408
7409 addr = sq->dma_addr + (sq->head << NVME_SQES);
7410 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7411 trace_pci_nvme_err_addr_read(addr);
7412 trace_pci_nvme_err_cfs();
7413 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7414 break;
7415 }
7416
7417 atomic = nvme_get_atomic(n, &cmd);
7418
7419 cmd_is_atomic = false;
7420 if (sq->sqid && atomic) {
7421 int ret;
7422
7423 ret = nvme_atomic_write_check(n, &cmd, atomic);
7424 switch (ret) {
7425 case NVME_ATOMIC_NO_START:
7426 qemu_bh_schedule(sq->bh);
7427 return;
7428 case NVME_ATOMIC_START_ATOMIC:
7429 cmd_is_atomic = true;
7430 break;
7431 case NVME_ATOMIC_START_NONATOMIC:
7432 default:
7433 break;
7434 }
7435 }
7436 nvme_inc_sq_head(sq);
7437
7438 req = QTAILQ_FIRST(&sq->req_list);
7439 QTAILQ_REMOVE(&sq->req_list, req, entry);
7440 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7441 nvme_req_clear(req);
7442 req->cqe.cid = cmd.cid;
7443 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7444
7445 if (sq->sqid && atomic) {
7446 req->atomic_write = cmd_is_atomic;
7447 }
7448
7449 status = sq->sqid ? nvme_io_cmd(n, req) :
7450 nvme_admin_cmd(n, req);
7451 if (status != NVME_NO_COMPLETE) {
7452 req->status = status;
7453 nvme_enqueue_req_completion(cq, req);
7454 }
7455
7456 if (n->dbbuf_enabled) {
7457 nvme_update_sq_eventidx(sq);
7458 nvme_update_sq_tail(sq);
7459 }
7460 }
7461 }
7462
7463 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7464 {
7465 uint8_t *config;
7466
7467 if (!msix_present(pci_dev)) {
7468 return;
7469 }
7470
7471 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7472
7473 config = pci_dev->config + pci_dev->msix_cap;
7474 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7475 table_size - 1);
7476 }
7477
7478 static void nvme_activate_virt_res(NvmeCtrl *n)
7479 {
7480 PCIDevice *pci_dev = PCI_DEVICE(n);
7481 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7482 NvmeSecCtrlEntry *sctrl;
7483
7484 /* -1 to account for the admin queue */
7485 if (pci_is_vf(pci_dev)) {
7486 sctrl = nvme_sctrl(n);
7487 cap->vqprt = sctrl->nvq;
7488 cap->viprt = sctrl->nvi;
7489 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7490 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7491 } else {
7492 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7493 cap->virfap = n->next_pri_ctrl_cap.virfap;
7494 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7495 le16_to_cpu(cap->vqrfap) - 1;
7496 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7497 le16_to_cpu(cap->virfap);
7498 }
7499 }
7500
7501 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7502 {
7503 PCIDevice *pci_dev = PCI_DEVICE(n);
7504 NvmeSecCtrlEntry *sctrl;
7505 NvmeNamespace *ns;
7506 int i;
7507
7508 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7509 ns = nvme_ns(n, i);
7510 if (!ns) {
7511 continue;
7512 }
7513
7514 nvme_ns_drain(ns);
7515 }
7516
7517 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7518 if (n->sq[i] != NULL) {
7519 nvme_free_sq(n->sq[i], n);
7520 }
7521 }
7522 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7523 if (n->cq[i] != NULL) {
7524 nvme_free_cq(n->cq[i], n);
7525 }
7526 }
7527
7528 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7529 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7530 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7531 g_free(event);
7532 }
7533
7534 if (n->params.sriov_max_vfs) {
7535 if (!pci_is_vf(pci_dev)) {
7536 for (i = 0; i < n->nr_sec_ctrls; i++) {
7537 sctrl = &n->sec_ctrl_list[i];
7538 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7539 }
7540 }
7541
7542 if (rst != NVME_RESET_CONTROLLER) {
7543 nvme_activate_virt_res(n);
7544 }
7545 }
7546
7547 n->aer_queued = 0;
7548 n->aer_mask = 0;
7549 n->outstanding_aers = 0;
7550 n->qs_created = false;
7551
7552 n->dn = n->params.atomic_dn; /* Set Disable Normal */
7553
7554 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7555
7556 if (pci_is_vf(pci_dev)) {
7557 sctrl = nvme_sctrl(n);
7558
7559 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7560 } else {
7561 stl_le_p(&n->bar.csts, 0);
7562 }
7563
7564 stl_le_p(&n->bar.intms, 0);
7565 stl_le_p(&n->bar.intmc, 0);
7566 stl_le_p(&n->bar.cc, 0);
7567
7568 n->dbbuf_dbs = 0;
7569 n->dbbuf_eis = 0;
7570 n->dbbuf_enabled = false;
7571 }
7572
7573 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7574 {
7575 NvmeNamespace *ns;
7576 int i;
7577
7578 if (n->pmr.dev) {
7579 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7580 }
7581
7582 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7583 ns = nvme_ns(n, i);
7584 if (!ns) {
7585 continue;
7586 }
7587
7588 nvme_ns_shutdown(ns);
7589 }
7590 }
7591
7592 static void nvme_select_iocs(NvmeCtrl *n)
7593 {
7594 NvmeNamespace *ns;
7595 int i;
7596
7597 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7598 ns = nvme_ns(n, i);
7599 if (!ns) {
7600 continue;
7601 }
7602
7603 nvme_select_iocs_ns(n, ns);
7604 }
7605 }
7606
7607 static int nvme_start_ctrl(NvmeCtrl *n)
7608 {
7609 uint64_t cap = ldq_le_p(&n->bar.cap);
7610 uint32_t cc = ldl_le_p(&n->bar.cc);
7611 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7612 uint64_t asq = ldq_le_p(&n->bar.asq);
7613 uint64_t acq = ldq_le_p(&n->bar.acq);
7614 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7615 uint32_t page_size = 1 << page_bits;
7616 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7617
7618 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7619 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7620 le16_to_cpu(sctrl->nvq));
7621 return -1;
7622 }
7623 if (unlikely(n->cq[0])) {
7624 trace_pci_nvme_err_startfail_cq();
7625 return -1;
7626 }
7627 if (unlikely(n->sq[0])) {
7628 trace_pci_nvme_err_startfail_sq();
7629 return -1;
7630 }
7631 if (unlikely(asq & (page_size - 1))) {
7632 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7633 return -1;
7634 }
7635 if (unlikely(acq & (page_size - 1))) {
7636 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7637 return -1;
7638 }
7639 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7640 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7641 return -1;
7642 }
7643 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7644 trace_pci_nvme_err_startfail_page_too_small(
7645 NVME_CC_MPS(cc),
7646 NVME_CAP_MPSMIN(cap));
7647 return -1;
7648 }
7649 if (unlikely(NVME_CC_MPS(cc) >
7650 NVME_CAP_MPSMAX(cap))) {
7651 trace_pci_nvme_err_startfail_page_too_large(
7652 NVME_CC_MPS(cc),
7653 NVME_CAP_MPSMAX(cap));
7654 return -1;
7655 }
7656 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7657 trace_pci_nvme_err_startfail_asqent_sz_zero();
7658 return -1;
7659 }
7660 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7661 trace_pci_nvme_err_startfail_acqent_sz_zero();
7662 return -1;
7663 }
7664
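/*
 * All sanity checks passed: latch the host page size, set up the admin queue
 * pair from AQA/ASQ/ACQ, reset the timestamp and select the I/O command sets
 * for each namespace.
 */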
7665 n->page_bits = page_bits;
7666 n->page_size = page_size;
7667 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7668 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7669 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7670
7671 nvme_set_timestamp(n, 0ULL);
7672
7673 nvme_select_iocs(n);
7674
7675 return 0;
7676 }
7677
7678 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7679 {
7680 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7681 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7682
7683 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7684 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7685 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7686 stl_le_p(&n->bar.cmbloc, cmbloc);
7687
7688 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7689 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7690 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7691 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7692 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7693 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7694 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7695 stl_le_p(&n->bar.cmbsz, cmbsz);
7696 }
7697
7698 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7699 unsigned size)
7700 {
7701 PCIDevice *pci = PCI_DEVICE(n);
7702 uint64_t cap = ldq_le_p(&n->bar.cap);
7703 uint32_t cc = ldl_le_p(&n->bar.cc);
7704 uint32_t intms = ldl_le_p(&n->bar.intms);
7705 uint32_t csts = ldl_le_p(&n->bar.csts);
7706 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7707
7708 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7709 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7710 "MMIO write not 32-bit aligned,"
7711 " offset=0x%"PRIx64"", offset);
7712 /* should be ignored, fall through for now */
7713 }
7714
7715 if (unlikely(size < sizeof(uint32_t))) {
7716 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7717 "MMIO write smaller than 32-bits,"
7718 " offset=0x%"PRIx64", size=%u",
7719 offset, size);
7720 /* should be ignored, fall through for now */
7721 }
7722
7723 switch (offset) {
7724 case NVME_REG_INTMS:
7725 if (unlikely(msix_enabled(pci))) {
7726 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7727 "undefined access to interrupt mask set"
7728 " when MSI-X is enabled");
7729 /* should be ignored, fall through for now */
7730 }
7731 intms |= data;
7732 stl_le_p(&n->bar.intms, intms);
7733 n->bar.intmc = n->bar.intms;
7734 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7735 nvme_irq_check(n);
7736 break;
7737 case NVME_REG_INTMC:
7738 if (unlikely(msix_enabled(pci))) {
7739 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7740 "undefined access to interrupt mask clr"
7741 " when MSI-X is enabled");
7742 /* should be ignored, fall through for now */
7743 }
7744 intms &= ~data;
7745 stl_le_p(&n->bar.intms, intms);
7746 n->bar.intmc = n->bar.intms;
7747 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7748 nvme_irq_check(n);
7749 break;
7750 case NVME_REG_CC:
7751 stl_le_p(&n->bar.cc, data);
7752
7753 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7754
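/*
 * Handle shutdown notification (CC.SHN) transitions before enable (CC.EN)
 * transitions. A controller disable resets the controller and updates CSTS
 * itself, so the final CSTS update is skipped in that case.
 */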
7755 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7756 trace_pci_nvme_mmio_shutdown_set();
7757 nvme_ctrl_shutdown(n);
7758 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7759 csts |= NVME_CSTS_SHST_COMPLETE;
7760 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7761 trace_pci_nvme_mmio_shutdown_cleared();
7762 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7763 }
7764
7765 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7766 if (unlikely(nvme_start_ctrl(n))) {
7767 trace_pci_nvme_err_startfail();
7768 csts = NVME_CSTS_FAILED;
7769 } else {
7770 trace_pci_nvme_mmio_start_success();
7771 csts = NVME_CSTS_READY;
7772 }
7773 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7774 trace_pci_nvme_mmio_stopped();
7775 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7776
7777 break;
7778 }
7779
7780 stl_le_p(&n->bar.csts, csts);
7781
7782 break;
7783 case NVME_REG_CSTS:
7784 if (data & (1 << 4)) {
7785 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7786 "attempted to W1C CSTS.NSSRO"
7787 " but CAP.NSSRS is zero (not supported)");
7788 } else if (data != 0) {
7789 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7790 "attempted to set a read only bit"
7791 " of controller status");
7792 }
7793 break;
7794 case NVME_REG_NSSR:
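        /* 4E564D65h is ASCII "NVMe", the NVM subsystem reset magic value */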
7795 if (data == 0x4e564d65) {
7796 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7797 } else {
7798 /* The spec says that writes of other values have no effect */
7799 return;
7800 }
7801 break;
7802 case NVME_REG_AQA:
7803 stl_le_p(&n->bar.aqa, data);
7804 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7805 break;
7806 case NVME_REG_ASQ:
7807 stn_le_p(&n->bar.asq, size, data);
7808 trace_pci_nvme_mmio_asqaddr(data);
7809 break;
7810 case NVME_REG_ASQ + 4:
7811 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7812 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7813 break;
7814 case NVME_REG_ACQ:
7815 trace_pci_nvme_mmio_acqaddr(data);
7816 stn_le_p(&n->bar.acq, size, data);
7817 break;
7818 case NVME_REG_ACQ + 4:
7819 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7820 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7821 break;
7822 case NVME_REG_CMBLOC:
7823 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7824 "invalid write to reserved CMBLOC"
7825 " when CMBSZ is zero, ignored");
7826 return;
7827 case NVME_REG_CMBSZ:
7828 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7829 "invalid write to read only CMBSZ, ignored");
7830 return;
7831 case NVME_REG_CMBMSC:
7832 if (!NVME_CAP_CMBS(cap)) {
7833 return;
7834 }
7835
7836 stn_le_p(&n->bar.cmbmsc, size, data);
7837 n->cmb.cmse = false;
7838
7839 if (NVME_CMBMSC_CRE(data)) {
7840 nvme_cmb_enable_regs(n);
7841
7842 if (NVME_CMBMSC_CMSE(data)) {
7843 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7844 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
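                /* reject a controller base address that would wrap around */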
7845 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7846 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7847 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7848 stl_le_p(&n->bar.cmbsts, cmbsts);
7849 return;
7850 }
7851
7852 n->cmb.cba = cba;
7853 n->cmb.cmse = true;
7854 }
7855 } else {
7856 n->bar.cmbsz = 0;
7857 n->bar.cmbloc = 0;
7858 }
7859
7860 return;
7861 case NVME_REG_CMBMSC + 4:
7862 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7863 return;
7864
7865 case NVME_REG_PMRCAP:
7866 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7867 "invalid write to PMRCAP register, ignored");
7868 return;
7869 case NVME_REG_PMRCTL:
7870 if (!NVME_CAP_PMRS(cap)) {
7871 return;
7872 }
7873
7874 stl_le_p(&n->bar.pmrctl, data);
7875 if (NVME_PMRCTL_EN(data)) {
7876 memory_region_set_enabled(&n->pmr.dev->mr, true);
7877 pmrsts = 0;
7878 } else {
7879 memory_region_set_enabled(&n->pmr.dev->mr, false);
7880 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7881 n->pmr.cmse = false;
7882 }
7883 stl_le_p(&n->bar.pmrsts, pmrsts);
7884 return;
7885 case NVME_REG_PMRSTS:
7886 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7887 "invalid write to PMRSTS register, ignored");
7888 return;
7889 case NVME_REG_PMREBS:
7890 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7891 "invalid write to PMREBS register, ignored");
7892 return;
7893 case NVME_REG_PMRSWTP:
7894 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7895 "invalid write to PMRSWTP register, ignored");
7896 return;
7897 case NVME_REG_PMRMSCL:
7898 if (!NVME_CAP_PMRS(cap)) {
7899 return;
7900 }
7901
7902 stl_le_p(&n->bar.pmrmscl, data);
7903 n->pmr.cmse = false;
7904
7905 if (NVME_PMRMSCL_CMSE(data)) {
7906 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7907 hwaddr cba = pmrmscu << 32 |
7908 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7909 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7910 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7911 stl_le_p(&n->bar.pmrsts, pmrsts);
7912 return;
7913 }
7914
7915 n->pmr.cmse = true;
7916 n->pmr.cba = cba;
7917 }
7918
7919 return;
7920 case NVME_REG_PMRMSCU:
7921 if (!NVME_CAP_PMRS(cap)) {
7922 return;
7923 }
7924
7925 stl_le_p(&n->bar.pmrmscu, data);
7926 return;
7927 default:
7928 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7929 "invalid MMIO write,"
7930 " offset=0x%"PRIx64", data=%"PRIx64"",
7931 offset, data);
7932 break;
7933 }
7934 }
7935
7936 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7937 {
7938 NvmeCtrl *n = (NvmeCtrl *)opaque;
7939 uint8_t *ptr = (uint8_t *)&n->bar;
7940
7941 trace_pci_nvme_mmio_read(addr, size);
7942
7943 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7944 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7945 "MMIO read not 32-bit aligned,"
7946 " offset=0x%"PRIx64"", addr);
7947 /* should RAZ, fall through for now */
7948 } else if (unlikely(size < sizeof(uint32_t))) {
7949 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7950 "MMIO read smaller than 32-bits,"
7951 " offset=0x%"PRIx64"", addr);
7952 /* should RAZ, fall through for now */
7953 }
7954
7955 if (addr > sizeof(n->bar) - size) {
7956 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7957 "MMIO read beyond last register,"
7958 " offset=0x%"PRIx64", returning 0", addr);
7959
7960 return 0;
7961 }
7962
7963 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7964 addr != NVME_REG_CSTS) {
7965 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7966 return 0;
7967 }
7968
7969 /*
7970      * When PMRWBM bit 1 is set, a read from
7971      * PMRSTS should ensure that prior writes
7972      * have made it to persistent media
7973 */
7974 if (addr == NVME_REG_PMRSTS &&
7975 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7976 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7977 }
7978
7979 return ldn_le_p(ptr + addr, size);
7980 }
7981
7982 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7983 {
7984 PCIDevice *pci = PCI_DEVICE(n);
7985 uint32_t qid;
7986
7987 if (unlikely(addr & ((1 << 2) - 1))) {
7988 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7989 "doorbell write not 32-bit aligned,"
7990 " offset=0x%"PRIx64", ignoring", addr);
7991 return;
7992 }
7993
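    /*
     * Doorbell registers start at offset 0x1000 with a 4-byte stride
     * (CAP.DSTRD is left at zero): the tail doorbell of submission queue y
     * is at 0x1000 + y * 8 and the head doorbell of completion queue y at
     * the following dword, which is what the parity test below decodes.
     */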
7994 if (((addr - 0x1000) >> 2) & 1) {
7995 /* Completion queue doorbell write */
7996
7997 uint16_t new_head = val & 0xffff;
7998 NvmeCQueue *cq;
7999
8000 qid = (addr - (0x1000 + (1 << 2))) >> 3;
8001 if (unlikely(nvme_check_cqid(n, qid))) {
8002 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
8003 "completion queue doorbell write"
8004 " for nonexistent queue,"
8005                            " cqid=%"PRIu32", ignoring", qid);
8006
8007 /*
8008              * NVM Express v1.3d, Section 4.1 states: "If host software writes
8009 * an invalid value to the Submission Queue Tail Doorbell or
8010 * Completion Queue Head Doorbell register and an Asynchronous Event
8011 * Request command is outstanding, then an asynchronous event is
8012 * posted to the Admin Completion Queue with a status code of
8013 * Invalid Doorbell Write Value."
8014 *
8015 * Also note that the spec includes the "Invalid Doorbell Register"
8016 * status code, but nowhere does it specify when to use it.
8017 * However, it seems reasonable to use it here in a similar
8018 * fashion.
8019 */
8020 if (n->outstanding_aers) {
8021 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8022 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8023 NVME_LOG_ERROR_INFO);
8024 }
8025
8026 return;
8027 }
8028
8029 cq = n->cq[qid];
8030 if (unlikely(new_head >= cq->size)) {
8031 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
8032 "completion queue doorbell write value"
8033                            " beyond queue size, cqid=%"PRIu32","
8034 " new_head=%"PRIu16", ignoring",
8035 qid, new_head);
8036
8037 if (n->outstanding_aers) {
8038 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8039 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8040 NVME_LOG_ERROR_INFO);
8041 }
8042
8043 return;
8044 }
8045
8046 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
8047
8048         /* schedule deferred cqe posting if the queue was previously full */
8049 if (nvme_cq_full(cq)) {
8050 qemu_bh_schedule(cq->bh);
8051 }
8052
8053 cq->head = new_head;
8054 if (!qid && n->dbbuf_enabled) {
8055 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
8056 }
8057
8058 if (cq->tail == cq->head) {
8059 if (cq->irq_enabled) {
8060 n->cq_pending--;
8061 }
8062
8063 nvme_irq_deassert(n, cq);
8064 }
8065 } else {
8066 /* Submission queue doorbell write */
8067
8068 uint16_t new_tail = val & 0xffff;
8069 NvmeSQueue *sq;
8070
8071 qid = (addr - 0x1000) >> 3;
8072 if (unlikely(nvme_check_sqid(n, qid))) {
8073 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
8074 "submission queue doorbell write"
8075 " for nonexistent queue,"
8076 " sqid=%"PRIu32", ignoring", qid);
8077
8078 if (n->outstanding_aers) {
8079 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8080 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8081 NVME_LOG_ERROR_INFO);
8082 }
8083
8084 return;
8085 }
8086
8087 sq = n->sq[qid];
8088 if (unlikely(new_tail >= sq->size)) {
8089 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
8090 "submission queue doorbell write value"
8091 " beyond queue size, sqid=%"PRIu32","
8092 " new_tail=%"PRIu16", ignoring",
8093 qid, new_tail);
8094
8095 if (n->outstanding_aers) {
8096 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8097 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8098 NVME_LOG_ERROR_INFO);
8099 }
8100
8101 return;
8102 }
8103
8104 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
8105
8106 sq->tail = new_tail;
8107 if (!qid && n->dbbuf_enabled) {
8108 /*
8109 * The spec states "the host shall also update the controller's
8110 * corresponding doorbell property to match the value of that entry
8111 * in the Shadow Doorbell buffer."
8112 *
8113 * Since this context is currently a VM trap, we can safely enforce
8114 * the requirement from the device side in case the host is
8115 * misbehaving.
8116 *
8117              * Note, we shouldn't have to do this, but various drivers,
8118              * including ones that run on Linux, do not update the Admin Queue's
8119              * shadow doorbell entry, so we can't trust it for an appropriate sq tail.
8120 */
8121 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
8122 }
8123
8124 qemu_bh_schedule(sq->bh);
8125 }
8126 }
8127
8128 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
8129 unsigned size)
8130 {
8131 NvmeCtrl *n = (NvmeCtrl *)opaque;
8132
8133 trace_pci_nvme_mmio_write(addr, data, size);
8134
8135 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8136 addr != NVME_REG_CSTS) {
8137 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8138 return;
8139 }
8140
8141 if (addr < sizeof(n->bar)) {
8142 nvme_write_bar(n, addr, data, size);
8143 } else {
8144 nvme_process_db(n, addr, data);
8145 }
8146 }
8147
8148 static const MemoryRegionOps nvme_mmio_ops = {
8149 .read = nvme_mmio_read,
8150 .write = nvme_mmio_write,
8151 .endianness = DEVICE_LITTLE_ENDIAN,
8152 .impl = {
8153 .min_access_size = 2,
8154 .max_access_size = 8,
8155 },
8156 };
8157
8158 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
8159 unsigned size)
8160 {
8161 NvmeCtrl *n = (NvmeCtrl *)opaque;
8162 stn_le_p(&n->cmb.buf[addr], size, data);
8163 }
8164
8165 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
8166 {
8167 NvmeCtrl *n = (NvmeCtrl *)opaque;
8168 return ldn_le_p(&n->cmb.buf[addr], size);
8169 }
8170
8171 static const MemoryRegionOps nvme_cmb_ops = {
8172 .read = nvme_cmb_read,
8173 .write = nvme_cmb_write,
8174 .endianness = DEVICE_LITTLE_ENDIAN,
8175 .impl = {
8176 .min_access_size = 1,
8177 .max_access_size = 8,
8178 },
8179 };
8180
8181 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8182 {
8183 NvmeParams *params = &n->params;
8184
8185 if (params->num_queues) {
8186 warn_report("num_queues is deprecated; please use max_ioqpairs "
8187 "instead");
8188
8189 params->max_ioqpairs = params->num_queues - 1;
8190 }
8191
8192 if (n->namespace.blkconf.blk && n->subsys) {
8193 error_setg(errp, "subsystem support is unavailable with legacy "
8194 "namespace ('drive' property)");
8195 return false;
8196 }
8197
8198 if (params->max_ioqpairs < 1 ||
8199 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8200 error_setg(errp, "max_ioqpairs must be between 1 and %d",
8201 NVME_MAX_IOQPAIRS);
8202 return false;
8203 }
8204
8205 if (params->msix_qsize < 1 ||
8206 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8207 error_setg(errp, "msix_qsize must be between 1 and %d",
8208 PCI_MSIX_FLAGS_QSIZE + 1);
8209 return false;
8210 }
8211
8212 if (!params->serial) {
8213 error_setg(errp, "serial property not set");
8214 return false;
8215 }
8216
8217 if (params->mqes < 1) {
8218 error_setg(errp, "mqes property cannot be less than 1");
8219 return false;
8220 }
8221
8222 if (n->pmr.dev) {
8223 if (params->msix_exclusive_bar) {
8224 error_setg(errp, "not enough BARs available to enable PMR");
8225 return false;
8226 }
8227
8228 if (host_memory_backend_is_mapped(n->pmr.dev)) {
8229 error_setg(errp, "can't use already busy memdev: %s",
8230 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8231 return false;
8232 }
8233
8234 if (!is_power_of_2(n->pmr.dev->size)) {
8235             error_setg(errp, "pmr backend size needs to be a power of 2");
8236 return false;
8237 }
8238
8239 host_memory_backend_set_mapped(n->pmr.dev, true);
8240 }
8241
8242 if (n->params.zasl > n->params.mdts) {
8243 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8244 "than or equal to mdts (Maximum Data Transfer Size)");
8245 return false;
8246 }
8247
8248 if (!n->params.vsl) {
8249 error_setg(errp, "vsl must be non-zero");
8250 return false;
8251 }
8252
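    /*
     * SR-IOV requires a subsystem and cannot be combined with CMB or PMR.
     * The flexible VQ/VI pools must cover the per-VF minimum (two queue
     * resources and one interrupt resource per VF) while leaving the PF at
     * least two I/O queue pairs and one interrupt vector.
     */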
8253 if (params->sriov_max_vfs) {
8254 if (!n->subsys) {
8255 error_setg(errp, "subsystem is required for the use of SR-IOV");
8256 return false;
8257 }
8258
8259 if (params->cmb_size_mb) {
8260 error_setg(errp, "CMB is not supported with SR-IOV");
8261 return false;
8262 }
8263
8264 if (n->pmr.dev) {
8265 error_setg(errp, "PMR is not supported with SR-IOV");
8266 return false;
8267 }
8268
8269 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8270 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8271 " must be set for the use of SR-IOV");
8272 return false;
8273 }
8274
8275 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8276 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8277 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8278 return false;
8279 }
8280
8281 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8282 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8283 " greater than or equal to 2");
8284 return false;
8285 }
8286
8287 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8288 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8289 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8290 return false;
8291 }
8292
8293 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8294 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8295 " greater than or equal to 1");
8296 return false;
8297 }
8298
8299 if (params->sriov_max_vi_per_vf &&
8300 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8301 error_setg(errp, "sriov_max_vi_per_vf must meet:"
8302 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8303 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8304 return false;
8305 }
8306
8307 if (params->sriov_max_vq_per_vf &&
8308 (params->sriov_max_vq_per_vf < 2 ||
8309 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8310 error_setg(errp, "sriov_max_vq_per_vf must meet:"
8311 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8312 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8313 return false;
8314 }
8315 }
8316
8317 return true;
8318 }
8319
8320 static void nvme_init_state(NvmeCtrl *n)
8321 {
8322 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8323 NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8324 NvmeSecCtrlEntry *sctrl;
8325 PCIDevice *pci = PCI_DEVICE(n);
8326 NvmeAtomic *atomic = &n->atomic;
8327 NvmeIdCtrl *id = &n->id_ctrl;
8328 uint8_t max_vfs;
8329 int i;
8330
8331 if (pci_is_vf(pci)) {
8332 sctrl = nvme_sctrl(n);
8333 max_vfs = 0;
8334 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8335 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8336 } else {
8337 max_vfs = n->params.sriov_max_vfs;
8338 n->conf_ioqpairs = n->params.max_ioqpairs;
8339 n->conf_msix_qsize = n->params.msix_qsize;
8340 }
8341
8342 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8343 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8344 n->temperature = NVME_TEMPERATURE;
8345 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8346 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8347 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8348 QTAILQ_INIT(&n->aer_queue);
8349
8350 n->nr_sec_ctrls = max_vfs;
8351 for (i = 0; i < max_vfs; i++) {
8352 sctrl = &list[i];
8353 sctrl->pcid = cpu_to_le16(n->cntlid);
8354 sctrl->vfn = cpu_to_le16(i + 1);
8355 }
8356
8357 cap->cntlid = cpu_to_le16(n->cntlid);
8358 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8359
8360 if (pci_is_vf(pci)) {
8361 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8362 } else {
8363 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8364 n->params.sriov_vq_flexible);
8365 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8366 cap->vqrfap = cap->vqfrt;
8367 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8368 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8369 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8370 cap->vqfrt / MAX(max_vfs, 1);
8371 }
8372
8373 if (pci_is_vf(pci)) {
8374 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8375 } else {
8376 cap->viprt = cpu_to_le16(n->params.msix_qsize -
8377 n->params.sriov_vi_flexible);
8378 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8379 cap->virfap = cap->vifrt;
8380 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8381 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8382 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8383 cap->vifrt / MAX(max_vfs, 1);
8384 }
8385
8386 /* Atomic Write */
8387 id->awun = cpu_to_le16(n->params.atomic_awun);
8388 id->awupf = cpu_to_le16(n->params.atomic_awupf);
8389 n->dn = n->params.atomic_dn;
8390
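    /*
     * The effective maximum atomic write size (in logical blocks) depends on
     * the Disable Normal setting: AWUPF + 1 when DN is set, AWUN + 1
     * otherwise (both fields are 0's based). Atomic write handling is only
     * enabled when that size is larger than a single block.
     */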
8391 if (id->awun || id->awupf) {
8392 if (id->awupf > id->awun) {
8393 id->awupf = 0;
8394 }
8395
8396 if (n->dn) {
8397 atomic->atomic_max_write_size = id->awupf + 1;
8398 } else {
8399 atomic->atomic_max_write_size = id->awun + 1;
8400 }
8401
8402 if (atomic->atomic_max_write_size == 1) {
8403 atomic->atomic_writes = 0;
8404 } else {
8405 atomic->atomic_writes = 1;
8406 }
8407 }
8408 }
8409
8410 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8411 {
8412 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8413 uint64_t cap = ldq_le_p(&n->bar.cap);
8414
8415 n->cmb.buf = g_malloc0(cmb_size);
8416 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8417 "nvme-cmb", cmb_size);
8418 pci_register_bar(pci_dev, NVME_CMB_BIR,
8419 PCI_BASE_ADDRESS_SPACE_MEMORY |
8420 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8421 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8422
8423 NVME_CAP_SET_CMBS(cap, 1);
8424 stq_le_p(&n->bar.cap, cap);
8425
8426 if (n->params.legacy_cmb) {
8427 nvme_cmb_enable_regs(n);
8428 n->cmb.cmse = true;
8429 }
8430 }
8431
8432 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8433 {
8434 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8435
8436 NVME_PMRCAP_SET_RDS(pmrcap, 1);
8437 NVME_PMRCAP_SET_WDS(pmrcap, 1);
8438 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8439     /* advertise PMRWBM bit 1: a read of PMRSTS persists prior PMR writes */
8440 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8441 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8442 stl_le_p(&n->bar.pmrcap, pmrcap);
8443
8444 pci_register_bar(pci_dev, NVME_PMR_BIR,
8445 PCI_BASE_ADDRESS_SPACE_MEMORY |
8446 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8447 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8448
8449 memory_region_set_enabled(&n->pmr.dev->mr, false);
8450 }
8451
8452 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8453 unsigned *msix_table_offset,
8454 unsigned *msix_pba_offset)
8455 {
8456 uint64_t bar_size, msix_table_size;
8457
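    /*
     * BAR0 layout: the fixed register file (NvmeBar), a pair of doorbells
     * per queue (including the admin queue) and, if interrupts are used, a
     * 4 KiB aligned MSI-X table followed by a 4 KiB aligned PBA. The total
     * is rounded up to a power of two as required for a PCI BAR.
     */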
8458 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8459
8460 if (total_irqs == 0) {
8461 goto out;
8462 }
8463
8464 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8465
8466 if (msix_table_offset) {
8467 *msix_table_offset = bar_size;
8468 }
8469
8470 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8471 bar_size += msix_table_size;
8472 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8473
8474 if (msix_pba_offset) {
8475 *msix_pba_offset = bar_size;
8476 }
8477
8478 bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8479
8480 out:
8481 return pow2ceil(bar_size);
8482 }
8483
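/*
 * Advertise the SR-IOV extended capability. Each VF gets a 64-bit memory
 * BAR0 sized to fit the per-VF maximum of flexible queue (VQFRSM) and
 * interrupt (VIFRSM) resources.
 */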
8484 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
8485 {
8486 uint16_t vf_dev_id = n->params.use_intel_id ?
8487 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8488 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8489 uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8490 le16_to_cpu(cap->vifrsm),
8491 NULL, NULL);
8492
8493 pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8494 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8495 NVME_VF_OFFSET, NVME_VF_STRIDE);
8496
8497 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8498 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8499 }
8500
8501 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8502 {
8503 Error *err = NULL;
8504 int ret;
8505
8506 ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8507 PCI_PM_SIZEOF, &err);
8508 if (err) {
8509 error_report_err(err);
8510 return ret;
8511 }
8512
8513 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8514 PCI_PM_CAP_VER_1_2);
8515 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8516 PCI_PM_CTRL_NO_SOFT_RESET);
8517 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8518 PCI_PM_CTRL_STATE_MASK);
8519
8520 return 0;
8521 }
8522
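/*
 * Relay a DOE mailbox request to the external SPDM responder behind
 * spdm_socket and copy the response into the read mailbox. DOE object
 * lengths are expressed in dwords, hence the conversions to and from bytes.
 */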
8523 static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8524 {
8525 void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8526 uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8527 void *rsp = doe_cap->read_mbox;
8528 uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8529
8530 uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8531 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8532 req, req_len, rsp, rsp_len);
8533 doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8534
8535 return recvd != 0;
8536 }
8537
8538 static DOEProtocol doe_spdm_prot[] = {
8539 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8540 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8541 { }
8542 };
8543
8544 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8545 {
8546 ERRP_GUARD();
8547 uint8_t *pci_conf = pci_dev->config;
8548 uint64_t bar_size;
8549 unsigned msix_table_offset = 0, msix_pba_offset = 0;
8550 unsigned nr_vectors;
8551 int ret;
8552
8553 pci_conf[PCI_INTERRUPT_PIN] = pci_is_vf(pci_dev) ? 0 : 1;
8554 pci_config_set_prog_interface(pci_conf, 0x2);
8555
8556 if (n->params.use_intel_id) {
8557 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8558 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8559 } else {
8560 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8561 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8562 }
8563
8564 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8565 nvme_add_pm_capability(pci_dev, 0x60);
8566 pcie_endpoint_cap_init(pci_dev, 0x80);
8567 pcie_cap_flr_init(pci_dev);
8568 if (n->params.sriov_max_vfs) {
8569 pcie_ari_init(pci_dev, 0x100);
8570 }
8571
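    /*
     * Two register BAR layouts are supported: with msix-exclusive-bar the
     * MSI-X table and PBA live in their own BAR and BAR0 holds only the
     * registers and doorbells; otherwise BAR0 also carries the MSI-X
     * structures at the offsets computed by nvme_mbar_size().
     */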
8572 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8573 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8574 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8575 bar_size);
8576 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8577 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8578 ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8579 } else {
8580 assert(n->params.msix_qsize >= 1);
8581
8582 /* add one to max_ioqpairs to account for the admin queue pair */
8583 if (!pci_is_vf(pci_dev)) {
8584 nr_vectors = n->params.msix_qsize;
8585 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8586 nr_vectors, &msix_table_offset,
8587 &msix_pba_offset);
8588 } else {
8589 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8590 NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8591
8592 nr_vectors = le16_to_cpu(cap->vifrsm);
8593 bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8594 &msix_table_offset, &msix_pba_offset);
8595 }
8596
8597 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8598 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8599 msix_table_offset);
8600 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8601
8602 if (pci_is_vf(pci_dev)) {
8603 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8604 } else {
8605 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8606 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8607 }
8608
8609 ret = msix_init(pci_dev, nr_vectors,
8610 &n->bar0, 0, msix_table_offset,
8611 &n->bar0, 0, msix_pba_offset, 0, errp);
8612 }
8613
8614 if (ret == -ENOTSUP) {
8615 /* report that msix is not supported, but do not error out */
8616 warn_report_err(*errp);
8617 *errp = NULL;
8618 } else if (ret < 0) {
8619 /* propagate error to caller */
8620 return false;
8621 }
8622
8623 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8624
8625 pcie_cap_deverr_init(pci_dev);
8626
8627 /* DOE Initialisation */
8628 if (pci_dev->spdm_port) {
8629 uint16_t doe_offset = n->params.sriov_max_vfs ?
8630 PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8631 : PCI_CONFIG_SPACE_SIZE;
8632
8633 pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8634 doe_spdm_prot, true, 0);
8635
8636 pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8637 errp);
8638
8639 if (pci_dev->doe_spdm.spdm_socket < 0) {
8640 return false;
8641 }
8642 }
8643
8644 if (n->params.cmb_size_mb) {
8645 nvme_init_cmb(n, pci_dev);
8646 }
8647
8648 if (n->pmr.dev) {
8649 nvme_init_pmr(n, pci_dev);
8650 }
8651
8652 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8653 nvme_init_sriov(n, pci_dev, 0x120);
8654 }
8655
8656 return true;
8657 }
8658
8659 static void nvme_init_subnqn(NvmeCtrl *n)
8660 {
8661 NvmeSubsystem *subsys = n->subsys;
8662 NvmeIdCtrl *id = &n->id_ctrl;
8663
8664 if (!subsys) {
8665 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8666 "nqn.2019-08.org.qemu:%s", n->params.serial);
8667 } else {
8668 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8669 }
8670 }
8671
8672 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8673 {
8674 NvmeIdCtrl *id = &n->id_ctrl;
8675 uint8_t *pci_conf = pci_dev->config;
8676 uint64_t cap = ldq_le_p(&n->bar.cap);
8677 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8678 uint32_t ctratt;
8679
8680 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8681 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8682 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8683 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8684 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8685
8686 id->cntlid = cpu_to_le16(n->cntlid);
8687
8688 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8689
8690 ctratt = NVME_CTRATT_ELBAS;
8691 if (n->params.ctratt.mem) {
8692 ctratt |= NVME_CTRATT_MEM;
8693 }
8694
8695 id->rab = 6;
8696
8697 if (n->params.use_intel_id) {
8698 id->ieee[0] = 0xb3;
8699 id->ieee[1] = 0x02;
8700 id->ieee[2] = 0x00;
8701 } else {
8702 id->ieee[0] = 0x00;
8703 id->ieee[1] = 0x54;
8704 id->ieee[2] = 0x52;
8705 }
8706
8707 id->mdts = n->params.mdts;
8708 id->ver = cpu_to_le32(NVME_SPEC_VER);
8709 id->oacs =
8710 cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8711 NVME_OACS_DIRECTIVES);
8712 id->cntrltype = 0x1;
8713
8714 /*
8715 * Because the controller always completes the Abort command immediately,
8716 * there can never be more than one concurrently executing Abort command,
8717 * so this value is never used for anything. Note that there can easily be
8718 * many Abort commands in the queues, but they are not considered
8719 * "executing" until processed by nvme_abort.
8720 *
8721 * The specification recommends a value of 3 for Abort Command Limit (four
8722      * concurrently outstanding Abort commands), so let's use that even though it is
8723 * inconsequential.
8724 */
8725 id->acl = 3;
8726 id->aerl = n->params.aerl;
8727 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8728 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8729
8730 /* recommended default value (~70 C) */
8731 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8732 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8733
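    /*
     * SQES/CQES encode the maximum (upper nibble) and minimum required
     * (lower nibble) queue entry sizes as powers of two; both are fixed to
     * the same value here.
     */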
8734 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8735 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8736 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8737 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8738 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8739 NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8740 NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8741
8742 /*
8743 * NOTE: If this device ever supports a command set that does NOT use 0x0
8744 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8745 * should probably be removed.
8746 *
8747 * See comment in nvme_io_cmd.
8748 */
8749 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8750
8751 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8752 NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8753 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
8754 NVME_CTRL_SGLS_MPTR_SGL);
8755
8756 nvme_init_subnqn(n);
8757
8758 id->psd[0].mp = cpu_to_le16(0x9c4);
8759 id->psd[0].enlat = cpu_to_le32(0x10);
8760 id->psd[0].exlat = cpu_to_le32(0x4);
8761
8762 if (n->subsys) {
8763 id->cmic |= NVME_CMIC_MULTI_CTRL;
8764 ctratt |= NVME_CTRATT_ENDGRPS;
8765
8766 id->endgidmax = cpu_to_le16(0x1);
8767
8768 if (n->subsys->endgrp.fdp.enabled) {
8769 ctratt |= NVME_CTRATT_FDPS;
8770 }
8771 }
8772
8773 id->ctratt = cpu_to_le32(ctratt);
8774
8775 NVME_CAP_SET_MQES(cap, n->params.mqes);
8776 NVME_CAP_SET_CQR(cap, 1);
8777 NVME_CAP_SET_TO(cap, 0xf);
8778 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8779 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8780 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8781 NVME_CAP_SET_MPSMAX(cap, 4);
8782 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8783 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8784 stq_le_p(&n->bar.cap, cap);
8785
8786 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8787 n->bar.intmc = n->bar.intms = 0;
8788
8789 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8790 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8791 }
8792 }
8793
8794 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8795 {
8796 int cntlid;
8797
8798 if (!n->subsys) {
8799 return 0;
8800 }
8801
8802 cntlid = nvme_subsys_register_ctrl(n, errp);
8803 if (cntlid < 0) {
8804 return -1;
8805 }
8806
8807 n->cntlid = cntlid;
8808
8809 return 0;
8810 }
8811
8812 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8813 {
8814 uint32_t nsid = ns->params.nsid;
8815 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8816
8817 n->namespaces[nsid] = ns;
8818 ns->attached++;
8819
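    /*
     * Cap DMRSL (Dataset Management Range Size Limit, in logical blocks) so
     * that a single Dataset Management range never exceeds what the block
     * layer accepts in one request.
     */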
8820 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8821 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8822 }
8823
8824 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8825 {
8826 NvmeCtrl *n = NVME(pci_dev);
8827 DeviceState *dev = DEVICE(pci_dev);
8828 NvmeNamespace *ns;
8829 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8830
8831 if (pci_is_vf(pci_dev)) {
8832 /*
8833          * VFs derive settings from the parent. The PF's lifespan exceeds
8834          * that of the VFs.
8835 */
8836 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8837
8838 /*
8839          * Duplicate the PF's serial into new string memory so that the PF's
8840          * 'serial' property is not released when a VF is removed from the system.
8841 */
8842 n->params.serial = g_strdup(pn->params.serial);
8843 n->subsys = pn->subsys;
8844
8845 /*
8846 * Assigning this link (strong link) causes an `object_unref` later in
8847 * `object_release_link_property`. Increment the refcount to balance
8848 * this out.
8849 */
8850 object_ref(OBJECT(pn->subsys));
8851 }
8852
8853 if (!nvme_check_params(n, errp)) {
8854 return;
8855 }
8856
8857 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8858
8859 if (nvme_init_subsys(n, errp)) {
8860 return;
8861 }
8862 nvme_init_state(n);
8863 if (!nvme_init_pci(n, pci_dev, errp)) {
8864 return;
8865 }
8866 nvme_init_ctrl(n, pci_dev);
8867
8868 /* setup a namespace if the controller drive property was given */
8869 if (n->namespace.blkconf.blk) {
8870 ns = &n->namespace;
8871 ns->params.nsid = 1;
8872
8873 if (nvme_ns_setup(ns, errp)) {
8874 return;
8875 }
8876
8877 nvme_attach_ns(n, ns);
8878 }
8879 }
8880
8881 static void nvme_exit(PCIDevice *pci_dev)
8882 {
8883 NvmeCtrl *n = NVME(pci_dev);
8884 NvmeNamespace *ns;
8885 int i;
8886
8887 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8888
8889 if (n->subsys) {
8890 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8891 ns = nvme_ns(n, i);
8892 if (ns) {
8893 ns->attached--;
8894 }
8895 }
8896
8897 nvme_subsys_unregister_ctrl(n->subsys, n);
8898 }
8899
8900 g_free(n->cq);
8901 g_free(n->sq);
8902 g_free(n->aer_reqs);
8903
8904 if (n->params.cmb_size_mb) {
8905 g_free(n->cmb.buf);
8906 }
8907
8908 if (pci_dev->doe_spdm.spdm_socket > 0) {
8909 spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
8910 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
8911 }
8912
8913 if (n->pmr.dev) {
8914 host_memory_backend_set_mapped(n->pmr.dev, false);
8915 }
8916
8917 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8918 pcie_sriov_pf_exit(pci_dev);
8919 }
8920
8921 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8922 msix_uninit_exclusive_bar(pci_dev);
8923 } else {
8924 msix_uninit(pci_dev, &n->bar0, &n->bar0);
8925 }
8926
8927 memory_region_del_subregion(&n->bar0, &n->iomem);
8928 }
8929
8930 static Property nvme_props[] = {
8931 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8932 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8933 HostMemoryBackend *),
8934 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8935 NvmeSubsystem *),
8936 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8937 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8938 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8939 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8940 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8941 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8942 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8943 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8944 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8945 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8946 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8947 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8948 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8949 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8950 params.auto_transition_zones, true),
8951 DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8952 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8953 params.sriov_vq_flexible, 0),
8954 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8955 params.sriov_vi_flexible, 0),
8956 DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
8957 params.sriov_max_vi_per_vf, 0),
8958 DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
8959 params.sriov_max_vq_per_vf, 0),
8960 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
8961 false),
8962 DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
8963 DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
8964 DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
8965 DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
8966 DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
8967 DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
8968 DEFINE_PROP_END_OF_LIST(),
8969 };
8970
8971 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8972 void *opaque, Error **errp)
8973 {
8974 NvmeCtrl *n = NVME(obj);
8975 uint8_t value = n->smart_critical_warning;
8976
8977 visit_type_uint8(v, name, &value, errp);
8978 }
8979
8980 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8981 void *opaque, Error **errp)
8982 {
8983 NvmeCtrl *n = NVME(obj);
8984 uint8_t value, old_value, cap = 0, index, event;
8985
8986 if (!visit_type_uint8(v, name, &value, errp)) {
8987 return;
8988 }
8989
8990 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8991 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8992 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8993 cap |= NVME_SMART_PMR_UNRELIABLE;
8994 }
8995
8996 if ((value & cap) != value) {
8997 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8998 value & ~cap);
8999 return;
9000 }
9001
9002 old_value = n->smart_critical_warning;
9003 n->smart_critical_warning = value;
9004
9005 /* only inject new bits of smart critical warning */
9006 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
9007 event = 1 << index;
9008         if (value & ~old_value & event) {
9009             nvme_smart_event(n, event);
        }
9010 }
9011 }
9012
9013 static void nvme_pci_reset(DeviceState *qdev)
9014 {
9015 PCIDevice *pci_dev = PCI_DEVICE(qdev);
9016 NvmeCtrl *n = NVME(pci_dev);
9017
9018 trace_pci_nvme_pci_reset();
9019 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9020 }
9021
9022 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
9023 {
9024 NvmeCtrl *n = NVME(dev);
9025 NvmeSecCtrlEntry *sctrl;
9026 int i;
9027
9028 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
9029 sctrl = &n->sec_ctrl_list[i];
9030 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
9031 }
9032 }
9033
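/*
 * Config space writes may carry DOE mailbox traffic, trigger an FLR or
 * change the number of enabled VFs; after the default handling, take any
 * secondary controllers whose VFs were just disabled offline.
 */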
9034 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
9035 uint32_t val, int len)
9036 {
9037 uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
9038
9039 if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9040 pcie_doe_write_config(&dev->doe_spdm, address, val, len);
9041 }
9042 pci_default_write_config(dev, address, val, len);
9043 pcie_cap_flr_write_config(dev, address, val, len);
9044 nvme_sriov_post_write_config(dev, old_num_vfs);
9045 }
9046
9047 static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
9048 {
9049 uint32_t val;
9050 if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9051 if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
9052 return val;
9053 }
9054 }
9055 return pci_default_read_config(dev, address, len);
9056 }
9057
9058 static const VMStateDescription nvme_vmstate = {
9059 .name = "nvme",
9060 .unmigratable = 1,
9061 };
9062
9063 static void nvme_class_init(ObjectClass *oc, void *data)
9064 {
9065 DeviceClass *dc = DEVICE_CLASS(oc);
9066 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
9067
9068 pc->realize = nvme_realize;
9069 pc->config_write = nvme_pci_write_config;
9070 pc->config_read = nvme_pci_read_config;
9071 pc->exit = nvme_exit;
9072 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
9073 pc->revision = 2;
9074
9075 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
9076 dc->desc = "Non-Volatile Memory Express";
9077 device_class_set_props(dc, nvme_props);
9078 dc->vmsd = &nvme_vmstate;
9079 device_class_set_legacy_reset(dc, nvme_pci_reset);
9080 }
9081
9082 static void nvme_instance_init(Object *obj)
9083 {
9084 NvmeCtrl *n = NVME(obj);
9085
9086 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
9087 "bootindex", "/namespace@1,0",
9088 DEVICE(obj));
9089
9090 object_property_add(obj, "smart_critical_warning", "uint8",
9091 nvme_get_smart_warning,
9092 nvme_set_smart_warning, NULL, NULL);
9093 }
9094
9095 static const TypeInfo nvme_info = {
9096 .name = TYPE_NVME,
9097 .parent = TYPE_PCI_DEVICE,
9098 .instance_size = sizeof(NvmeCtrl),
9099 .instance_init = nvme_instance_init,
9100 .class_init = nvme_class_init,
9101 .interfaces = (InterfaceInfo[]) {
9102 { INTERFACE_PCIE_DEVICE },
9103 { }
9104 },
9105 };
9106
9107 static const TypeInfo nvme_bus_info = {
9108 .name = TYPE_NVME_BUS,
9109 .parent = TYPE_BUS,
9110 .instance_size = sizeof(NvmeBus),
9111 };
9112
9113 static void nvme_register_types(void)
9114 {
9115 type_register_static(&nvme_info);
9116 type_register_static(&nvme_bus_info);
9117 }
9118
9119 type_init(nvme_register_types)
9120