1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 *
16 *
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
20 * NVMe subsystem uses the format of the NVMe specifications in comments
21 * (i.e. an 'h' suffix instead of a '0x' prefix).
22 *
23 * Usage
24 * -----
25 * See docs/system/devices/nvme.rst for extensive documentation.
26 *
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]> \
39 * sriov_vq_flexible=<N[optional]> \
40 * sriov_vi_flexible=<N[optional]> \
41 * sriov_max_vi_per_vf=<N[optional]> \
42 * sriov_max_vq_per_vf=<N[optional]> \
43 * atomic.dn=<on|off[optional]>, \
44 *              atomic.awun=<N[optional]>, \
45 *              atomic.awupf=<N[optional]>, \
46 * subsys=<subsys_id>
47 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
48 * zoned=<true|false[optional]>, \
49 * subsys=<subsys_id>,shared=<true|false[optional]>, \
50 * detached=<true|false[optional]>, \
51 * zoned.zone_size=<N[optional]>, \
52 * zoned.zone_capacity=<N[optional]>, \
53 * zoned.descr_ext_size=<N[optional]>, \
54 * zoned.max_active=<N[optional]>, \
55 * zoned.max_open=<N[optional]>, \
56 * zoned.cross_read=<true|false[optional]>
57 *
58 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
59 * to be at offset 0 in BAR2 and currently supports only WDS, RDS and SQS. By
60 * default, the device uses the "v1.4 CMB scheme" - use the `legacy-cmb`
61 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
62 *
63 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
64 * For example:
65 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
66 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
67 *
68 * The PMR will use BAR 4/5 exclusively.
69 *
70 * To place controller(s) and namespace(s) in a subsystem, provide the
71 * nvme-subsys device as shown above.
72 *
73 * nvme subsystem device parameters
74 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75 * - `nqn`
76 * This parameter provides the `<nqn_id>` part of the string
77 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
78 * of subsystem controllers. Note that `<nqn_id>` should be unique per
79 * subsystem, but this is not enforced by QEMU. If not specified, it will
80 * default to the value of the `id` parameter (`<subsys_id>`).
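 *   For example (illustrative), `-device nvme-subsys,id=subsys0,nqn=subsys0`
 *   results in subsystem controllers reporting a SUBNQN of
 *   `nqn.2019-08.org.qemu:subsys0`.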
81 *
82 * nvme device parameters
83 * ~~~~~~~~~~~~~~~~~~~~~~
84 * - `subsys`
85 *   Specifying this parameter attaches the controller to the subsystem, and
86 *   the SUBNQN field of the controller will report the NQN of the subsystem
87 *   device. This also enables the multi-controller capability, reported in the
88 *   CMIC (Controller Multi-path I/O and Namespace Sharing Capabilities) field
89 *   of the Identify Controller data structure.
90 *
91 * - `aerl`
92 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
93 *   of concurrently outstanding Asynchronous Event Request commands supported
94 *   by the controller. This is a 0's based value.
95 *
96 * - `aer_max_queued`
97 *   This is the maximum number of events that the device will enqueue for
98 *   completion when there are no outstanding AERs. When the maximum number of
99 *   enqueued events is reached, subsequent events will be dropped.
100 *
101 * - `mdts`
102 * Indicates the maximum data transfer size for a command that transfers data
103 * between host-accessible memory and the controller. The value is specified
104 * as a power of two (2^n) and is in units of the minimum memory page size
105 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
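 *   As an illustration, with the 4 KiB minimum memory page size the default
 *   of 7 allows transfers of up to 2^7 * 4 KiB = 512 KiB per command.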
106 *
107 * - `vsl`
108 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
109 * this value is specified as a power of two (2^n) and is in units of the
110 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
111 * KiB).
112 *
113 * - `zoned.zasl`
114 * Indicates the maximum data transfer size for the Zone Append command. Like
115 * `mdts`, the value is specified as a power of two (2^n) and is in units of
116 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
117 * defaulting to the value of `mdts`).
118 *
119 * - `zoned.auto_transition`
120 *   Indicates whether zones in the Implicitly Opened state may be
121 *   automatically transitioned to the Closed state for resource management
122 *   purposes. Defaults to 'on'.
123 *
124 * - `sriov_max_vfs`
125 * Indicates the maximum number of PCIe virtual functions supported
126 * by the controller. The default value is 0. Specifying a non-zero value
127 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
128 * Virtual function controllers will not report SR-IOV capability.
129 *
130 * NOTE: Single Root I/O Virtualization support is experimental.
131 * All the related parameters may be subject to change.
132 *
133 * - `sriov_vq_flexible`
134 * Indicates the total number of flexible queue resources assignable to all
135 *   the secondary controllers. Implicitly sets the number of the primary
136 *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
137 *
138 * - `sriov_vi_flexible`
139 * Indicates the total number of flexible interrupt resources assignable to
140 *   all the secondary controllers. Implicitly sets the number of the primary
141 *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
142 *
143 * - `sriov_max_vi_per_vf`
144 * Indicates the maximum number of virtual interrupt resources assignable
145 * to a secondary controller. The default 0 resolves to
146 * `(sriov_vi_flexible / sriov_max_vfs)`.
147 *
148 * - `sriov_max_vq_per_vf`
149 * Indicates the maximum number of virtual queue resources assignable to
150 * a secondary controller. The default 0 resolves to
151 * `(sriov_vq_flexible / sriov_max_vfs)`.
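 *   Illustrative example (not defaults): with `max_ioqpairs=26`,
 *   `sriov_max_vfs=4` and `sriov_vq_flexible=8`, the primary controller keeps
 *   26 - 8 = 18 private queue pairs and each VF defaults to 8 / 4 = 2 flexible
 *   queue resources.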
152 *
153 * nvme namespace device parameters
154 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155 * - `shared`
156 * When the parent nvme device (as defined explicitly by the 'bus' parameter
157 * or implicitly by the most recently defined NvmeBus) is linked to an
158 * nvme-subsys device, the namespace will be attached to all controllers in
159 * the subsystem. If set to 'off' (the default), the namespace will remain a
160 * private namespace and may only be attached to a single controller at a
161 * time.
162 *
163 * - `detached`
164 * This parameter is only valid together with the `subsys` parameter. If left
165 * at the default value (`false/off`), the namespace will be attached to all
166 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
167 * namespace will be available in the subsystem but not attached to any
168 * controllers.
169 *
170 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
171 * In this case, the following namespace properties are available to configure
172 * zoned operation:
173 * zoned.zone_size=<zone size in bytes, default: 128MiB>
174 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
175 *
176 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
177 * The value 0 (default) forces zone capacity to be the same as zone
178 * size. The value of this property may not exceed zone size.
179 *
180 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
181 * This value needs to be specified in 64B units. If it is zero,
182 * namespace(s) will not support zone descriptor extensions.
183 *
184 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
185 * The default value means there is no limit to the number of
186 * concurrently active zones.
187 *
188 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
189 * The default value means there is no limit to the number of
190 * concurrently open zones.
191 *
192 * zoned.cross_read=<enable RAZB, default: false>
193 * Setting this property to true enables Read Across Zone Boundaries.
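 *
 * An example of a zoned namespace using the properties above (all values are
 * illustrative, not recommendations):
 *   -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=1,zoned=true, \
 *      zoned.zone_size=64M,zoned.zone_capacity=62M, \
 *      zoned.max_open=16,zoned.max_active=32,zoned.cross_read=true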
194 */
195
196 #include "qemu/osdep.h"
197 #include "qemu/cutils.h"
198 #include "qemu/error-report.h"
199 #include "qemu/log.h"
200 #include "qemu/units.h"
201 #include "qemu/range.h"
202 #include "qapi/error.h"
203 #include "qapi/visitor.h"
204 #include "system/system.h"
205 #include "system/block-backend.h"
206 #include "system/hostmem.h"
207 #include "hw/pci/msix.h"
208 #include "hw/pci/pcie_sriov.h"
209 #include "system/spdm-socket.h"
210 #include "migration/vmstate.h"
211
212 #include "nvme.h"
213 #include "dif.h"
214 #include "trace.h"
215
216 #define NVME_MAX_IOQPAIRS 0xffff
217 #define NVME_DB_SIZE 4
218 #define NVME_SPEC_VER 0x00010400
219 #define NVME_CMB_BIR 2
220 #define NVME_PMR_BIR 4
221 #define NVME_TEMPERATURE 0x143
222 #define NVME_TEMPERATURE_WARNING 0x157
223 #define NVME_TEMPERATURE_CRITICAL 0x175
224 #define NVME_NUM_FW_SLOTS 1
225 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
226 #define NVME_VF_RES_GRANULARITY 1
227 #define NVME_VF_OFFSET 0x1
228 #define NVME_VF_STRIDE 1
229
230 #define NVME_GUEST_ERR(trace, fmt, ...) \
231 do { \
232 (trace_##trace)(__VA_ARGS__); \
233 qemu_log_mask(LOG_GUEST_ERROR, #trace \
234 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
235 } while (0)
236
237 static const bool nvme_feature_support[NVME_FID_MAX] = {
238 [NVME_ARBITRATION] = true,
239 [NVME_POWER_MANAGEMENT] = true,
240 [NVME_TEMPERATURE_THRESHOLD] = true,
241 [NVME_ERROR_RECOVERY] = true,
242 [NVME_VOLATILE_WRITE_CACHE] = true,
243 [NVME_NUMBER_OF_QUEUES] = true,
244 [NVME_INTERRUPT_COALESCING] = true,
245 [NVME_INTERRUPT_VECTOR_CONF] = true,
246 [NVME_WRITE_ATOMICITY] = true,
247 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
248 [NVME_TIMESTAMP] = true,
249 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
250 [NVME_COMMAND_SET_PROFILE] = true,
251 [NVME_FDP_MODE] = true,
252 [NVME_FDP_EVENTS] = true,
253 };
254
255 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
256 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
258 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
259 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
260 [NVME_WRITE_ATOMICITY] = NVME_FEAT_CAP_CHANGE,
261 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
262 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
263 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
264 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
265 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
266 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
267 };
268
269 static const uint32_t nvme_cse_acs_default[256] = {
270 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
277 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
280 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC |
281 NVME_CMD_EFF_CCC,
282 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
283 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
284 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
285 };
286
287 static const uint32_t nvme_cse_iocs_nvm_default[256] = {
288 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
289 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
290 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
291 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
292 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
294 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
295 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
296 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
297 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
298 };
299
300 static const uint32_t nvme_cse_iocs_zoned_default[256] = {
301 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
302 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
303 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
304 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
305 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
307 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
308 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
309 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
310 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
311
312 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
313 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
314 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
315 };
316
317 static void nvme_process_sq(void *opaque);
318 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
319 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
320
321 static uint16_t nvme_sqid(NvmeRequest *req)
322 {
323 return le16_to_cpu(req->sq->sqid);
324 }
325
326 static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
327 uint16_t ph)
328 {
329 uint16_t rgif = ns->endgrp->fdp.rgif;
330
331 if (!rgif) {
332 return ph;
333 }
334
335 return (rg << (16 - rgif)) | ph;
336 }
337
338 static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
339 {
340 return ph < ns->fdp.nphs;
341 }
342
343 static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
344 {
345 return rg < endgrp->fdp.nrg;
346 }
347
348 static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
349 {
350 uint16_t rgif = ns->endgrp->fdp.rgif;
351
352 if (!rgif) {
353 return pid;
354 }
355
356 return pid & ((1 << (15 - rgif)) - 1);
357 }
358
359 static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
360 {
361 uint16_t rgif = ns->endgrp->fdp.rgif;
362
363 if (!rgif) {
364 return 0;
365 }
366
367 return pid >> (16 - rgif);
368 }
369
370 static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
371 uint16_t *ph, uint16_t *rg)
372 {
373 *rg = nvme_pid2rg(ns, pid);
374 *ph = nvme_pid2ph(ns, pid);
375
376 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
377 }
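
/*
 * Worked example for the pid helpers above (illustrative values): with
 * rgif = 4, rg = 3 and ph = 5, nvme_make_pid() yields (3 << 12) | 5 = 0x3005;
 * nvme_pid2rg() recovers 0x3005 >> 12 = 3 and nvme_pid2ph() recovers
 * 0x3005 & 0x7ff = 5.
 */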
378
379 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
380 NvmeZoneState state)
381 {
382 if (QTAILQ_IN_USE(zone, entry)) {
383 switch (nvme_get_zone_state(zone)) {
384 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
385 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
386 break;
387 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
388 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
389 break;
390 case NVME_ZONE_STATE_CLOSED:
391 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
392 break;
393 case NVME_ZONE_STATE_FULL:
394 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
395 default:
396 ;
397 }
398 }
399
400 nvme_set_zone_state(zone, state);
401
402 switch (state) {
403 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
404 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
405 break;
406 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
407 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
408 break;
409 case NVME_ZONE_STATE_CLOSED:
410 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
411 break;
412 case NVME_ZONE_STATE_FULL:
413 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
414 case NVME_ZONE_STATE_READ_ONLY:
415 break;
416 default:
417 zone->d.za = 0;
418 }
419 }
420
421 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
422 uint32_t opn, uint32_t zrwa)
423 {
424 if (ns->params.max_active_zones != 0 &&
425 ns->nr_active_zones + act > ns->params.max_active_zones) {
426 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
427 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
428 }
429
430 if (ns->params.max_open_zones != 0 &&
431 ns->nr_open_zones + opn > ns->params.max_open_zones) {
432 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
433 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
434 }
435
436 if (zrwa > ns->zns.numzrwa) {
437 return NVME_NOZRWA | NVME_DNR;
438 }
439
440 return NVME_SUCCESS;
441 }
442
443 /*
444 * Check if we can open a zone without exceeding open/active limits.
445 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
446 */
447 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
448 {
449 return nvme_zns_check_resources(ns, act, opn, 0);
450 }
451
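/*
 * Allocate a slot in the circular FDP event buffer; when the buffer is full,
 * the oldest event is overwritten.
 */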
452 static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
453 {
454 NvmeFdpEvent *ret = NULL;
455 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
456
457 ret = &ebuf->events[ebuf->next++];
458 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
459 ebuf->next = 0;
460 }
461 if (is_full) {
462 ebuf->start = ebuf->next;
463 } else {
464 ebuf->nelems++;
465 }
466
467 memset(ret, 0, sizeof(NvmeFdpEvent));
468 ret->timestamp = nvme_get_timestamp(n);
469
470 return ret;
471 }
472
473 static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
474 {
475 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
476 }
477
478 static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
479 {
480 NvmeEnduranceGroup *endgrp = ns->endgrp;
481 NvmeRuHandle *ruh;
482 NvmeReclaimUnit *ru;
483 NvmeFdpEvent *e = NULL;
484 uint16_t ph, rg, ruhid;
485
486 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
487 return false;
488 }
489
490 ruhid = ns->fdp.phs[ph];
491
492 ruh = &endgrp->fdp.ruhs[ruhid];
493 ru = &ruh->rus[rg];
494
495 if (ru->ruamw) {
496 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
497 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
498 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
499 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
500 e->pid = cpu_to_le16(pid);
501 e->nsid = cpu_to_le32(ns->params.nsid);
502 e->rgid = cpu_to_le16(rg);
503 e->ruhid = cpu_to_le16(ruhid);
504 }
505
506 /* log (eventual) GC overhead of prematurely swapping the RU */
507 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
508 }
509
510 ru->ruamw = ruh->ruamw;
511
512 return true;
513 }
514
515 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
516 {
517 hwaddr hi, lo;
518
519 if (!n->cmb.cmse) {
520 return false;
521 }
522
523 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
524 hi = lo + int128_get64(n->cmb.mem.size);
525
526 return addr >= lo && addr < hi;
527 }
528
529 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
530 {
531 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
532 return &n->cmb.buf[addr - base];
533 }
534
535 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
536 {
537 hwaddr hi;
538
539 if (!n->pmr.cmse) {
540 return false;
541 }
542
543 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
544
545 return addr >= n->pmr.cba && addr < hi;
546 }
547
548 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
549 {
550 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
551 }
552
553 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
554 {
555 hwaddr hi, lo;
556
557 /*
558 * The purpose of this check is to guard against invalid "local" access to
559 * the iomem (i.e. controller registers). Thus, we check against the range
560 * covered by the 'bar0' MemoryRegion since that is currently composed of
561 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
562 * that if the device model is ever changed to allow the CMB to be located
563 * in BAR0 as well, then this must be changed.
564 */
565 lo = n->bar0.addr;
566 hi = lo + int128_get64(n->bar0.size);
567
568 return addr >= lo && addr < hi;
569 }
570
571 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
572 {
573 hwaddr hi = addr + size - 1;
574 if (hi < addr) {
575 return 1;
576 }
577
578 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
579 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
580 return 0;
581 }
582
583 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
584 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
585 return 0;
586 }
587
588 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
589 }
590
591 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
592 {
593 hwaddr hi = addr + size - 1;
594 if (hi < addr) {
595 return 1;
596 }
597
598 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
599 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
600 return 0;
601 }
602
603 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
604 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
605 return 0;
606 }
607
608 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
609 }
610
611 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
612 {
613 return nsid &&
614 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
615 }
616
617 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
618 {
619 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
620 }
621
622 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
623 {
624 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
625 }
626
627 static void nvme_inc_cq_tail(NvmeCQueue *cq)
628 {
629 cq->tail++;
630 if (cq->tail >= cq->size) {
631 cq->tail = 0;
632 cq->phase = !cq->phase;
633 }
634 }
635
636 static void nvme_inc_sq_head(NvmeSQueue *sq)
637 {
638 sq->head = (sq->head + 1) % sq->size;
639 }
640
641 static uint8_t nvme_cq_full(NvmeCQueue *cq)
642 {
643 return (cq->tail + 1) % cq->size == cq->head;
644 }
645
646 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
647 {
648 return sq->head == sq->tail;
649 }
650
651 static void nvme_irq_check(NvmeCtrl *n)
652 {
653 PCIDevice *pci = PCI_DEVICE(n);
654 uint32_t intms = ldl_le_p(&n->bar.intms);
655
656 if (msix_enabled(pci)) {
657 return;
658 }
659
660     /* VFs do not implement INTx */
661 if (pci_is_vf(pci)) {
662 return;
663 }
664
665 if (~intms & n->irq_status) {
666 pci_irq_assert(pci);
667 } else {
668 pci_irq_deassert(pci);
669 }
670 }
671
672 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
673 {
674 PCIDevice *pci = PCI_DEVICE(n);
675
676 if (cq->irq_enabled) {
677 if (msix_enabled(pci)) {
678 trace_pci_nvme_irq_msix(cq->vector);
679 msix_notify(pci, cq->vector);
680 } else {
681 trace_pci_nvme_irq_pin();
682 assert(cq->vector < 32);
683 n->irq_status |= 1 << cq->vector;
684 nvme_irq_check(n);
685 }
686 } else {
687 trace_pci_nvme_irq_masked();
688 }
689 }
690
691 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
692 {
693 if (cq->irq_enabled) {
694 if (msix_enabled(PCI_DEVICE(n))) {
695 return;
696 } else {
697 assert(cq->vector < 32);
698 if (!n->cq_pending) {
699 n->irq_status &= ~(1 << cq->vector);
700 }
701 nvme_irq_check(n);
702 }
703 }
704 }
705
706 static void nvme_req_clear(NvmeRequest *req)
707 {
708 req->ns = NULL;
709 req->opaque = NULL;
710 req->aiocb = NULL;
711 memset(&req->cqe, 0x0, sizeof(req->cqe));
712 req->status = NVME_SUCCESS;
713 }
714
715 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
716 {
717 if (dma) {
718 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
719 sg->flags = NVME_SG_DMA;
720 } else {
721 qemu_iovec_init(&sg->iov, 0);
722 }
723
724 sg->flags |= NVME_SG_ALLOC;
725 }
726
727 static inline void nvme_sg_unmap(NvmeSg *sg)
728 {
729 if (!(sg->flags & NVME_SG_ALLOC)) {
730 return;
731 }
732
733 if (sg->flags & NVME_SG_DMA) {
734 qemu_sglist_destroy(&sg->qsg);
735 } else {
736 qemu_iovec_destroy(&sg->iov);
737 }
738
739 memset(sg, 0x0, sizeof(*sg));
740 }
741
742 /*
743 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
744 * holds both data and metadata. This function splits the data and metadata
745 * into two separate QSG/IOVs.
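 * As an illustration (values are arbitrary): with a 512-byte LBA format and
 * 8 bytes of metadata per LBA, the mapped buffer alternates 512-byte data
 * runs and 8-byte metadata runs; `data` receives the former and `mdata` the
 * latter, regardless of how the runs straddle SGL/IOV entries.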
746 */
747 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
748 NvmeSg *mdata)
749 {
750 NvmeSg *dst = data;
751 uint32_t trans_len, count = ns->lbasz;
752 uint64_t offset = 0;
753 bool dma = sg->flags & NVME_SG_DMA;
754 size_t sge_len;
755 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
756 int sg_idx = 0;
757
758 assert(sg->flags & NVME_SG_ALLOC);
759
760 while (sg_len) {
761 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
762
763 trans_len = MIN(sg_len, count);
764 trans_len = MIN(trans_len, sge_len - offset);
765
766 if (dst) {
767 if (dma) {
768 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
769 trans_len);
770 } else {
771 qemu_iovec_add(&dst->iov,
772 sg->iov.iov[sg_idx].iov_base + offset,
773 trans_len);
774 }
775 }
776
777 sg_len -= trans_len;
778 count -= trans_len;
779 offset += trans_len;
780
781 if (count == 0) {
782 dst = (dst == data) ? mdata : data;
783 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
784 }
785
786 if (sge_len == offset) {
787 offset = 0;
788 sg_idx++;
789 }
790 }
791 }
792
793 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
794 size_t len)
795 {
796 if (!len) {
797 return NVME_SUCCESS;
798 }
799
800 trace_pci_nvme_map_addr_cmb(addr, len);
801
802 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
803 return NVME_DATA_TRAS_ERROR;
804 }
805
806 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
807
808 return NVME_SUCCESS;
809 }
810
811 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
812 size_t len)
813 {
814 if (!len) {
815 return NVME_SUCCESS;
816 }
817
818 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
819 return NVME_DATA_TRAS_ERROR;
820 }
821
822 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
823
824 return NVME_SUCCESS;
825 }
826
827 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
828 {
829 bool cmb = false, pmr = false;
830
831 if (!len) {
832 return NVME_SUCCESS;
833 }
834
835 trace_pci_nvme_map_addr(addr, len);
836
837 if (nvme_addr_is_iomem(n, addr)) {
838 return NVME_DATA_TRAS_ERROR;
839 }
840
841 if (nvme_addr_is_cmb(n, addr)) {
842 cmb = true;
843 } else if (nvme_addr_is_pmr(n, addr)) {
844 pmr = true;
845 }
846
847 if (cmb || pmr) {
848 if (sg->flags & NVME_SG_DMA) {
849 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
850 }
851
852 if (sg->iov.niov + 1 > IOV_MAX) {
853 goto max_mappings_exceeded;
854 }
855
856 if (cmb) {
857 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
858 } else {
859 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
860 }
861 }
862
863 if (!(sg->flags & NVME_SG_DMA)) {
864 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
865 }
866
867 if (sg->qsg.nsg + 1 > IOV_MAX) {
868 goto max_mappings_exceeded;
869 }
870
871 qemu_sglist_add(&sg->qsg, addr, len);
872
873 return NVME_SUCCESS;
874
875 max_mappings_exceeded:
876 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
877                    "number of mappings exceeds 1024");
878 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
879 }
880
881 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
882 {
883 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
884 }
885
886 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
887 uint64_t prp2, uint32_t len)
888 {
889 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
890 trans_len = MIN(len, trans_len);
891 int num_prps = (len >> n->page_bits) + 1;
892 uint16_t status;
893 int ret;
894
895 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
896
897 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
898
899 status = nvme_map_addr(n, sg, prp1, trans_len);
900 if (status) {
901 goto unmap;
902 }
903
904 len -= trans_len;
905 if (len) {
906 if (len > n->page_size) {
907 g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
908 uint32_t nents, prp_trans;
909 int i = 0;
910
911 /*
912             * The first PRP list entry, pointed to by PRP2, may contain an
913             * offset. Hence, we need to calculate the number of entries based
914             * on that offset.
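             * For example (illustrative), with a 4 KiB page size and PRP2
             * pointing 16 bytes into a page, the first list holds
             * (4096 - 16) / 8 = 510 entries.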
915 */
916 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
917 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
918 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
919 if (ret) {
920 trace_pci_nvme_err_addr_read(prp2);
921 status = NVME_DATA_TRAS_ERROR;
922 goto unmap;
923 }
924 while (len != 0) {
925 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
926
927 if (i == nents - 1 && len > n->page_size) {
928 if (unlikely(prp_ent & (n->page_size - 1))) {
929 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
930 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
931 goto unmap;
932 }
933
934 i = 0;
935 nents = (len + n->page_size - 1) >> n->page_bits;
936 nents = MIN(nents, n->max_prp_ents);
937 prp_trans = nents * sizeof(uint64_t);
938 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
939 prp_trans);
940 if (ret) {
941 trace_pci_nvme_err_addr_read(prp_ent);
942 status = NVME_DATA_TRAS_ERROR;
943 goto unmap;
944 }
945 prp_ent = le64_to_cpu(prp_list[i]);
946 }
947
948 if (unlikely(prp_ent & (n->page_size - 1))) {
949 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
950 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
951 goto unmap;
952 }
953
954 trans_len = MIN(len, n->page_size);
955 status = nvme_map_addr(n, sg, prp_ent, trans_len);
956 if (status) {
957 goto unmap;
958 }
959
960 len -= trans_len;
961 i++;
962 }
963 } else {
964 if (unlikely(prp2 & (n->page_size - 1))) {
965 trace_pci_nvme_err_invalid_prp2_align(prp2);
966 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
967 goto unmap;
968 }
969 status = nvme_map_addr(n, sg, prp2, len);
970 if (status) {
971 goto unmap;
972 }
973 }
974 }
975
976 return NVME_SUCCESS;
977
978 unmap:
979 nvme_sg_unmap(sg);
980 return status;
981 }
982
983 /*
984 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
985 * number of bytes mapped in len.
986 */
987 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
988 NvmeSglDescriptor *segment, uint64_t nsgld,
989 size_t *len, NvmeCmd *cmd)
990 {
991 dma_addr_t addr, trans_len;
992 uint32_t dlen;
993 uint16_t status;
994
995 for (int i = 0; i < nsgld; i++) {
996 uint8_t type = NVME_SGL_TYPE(segment[i].type);
997
998 switch (type) {
999 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
1000 break;
1001 case NVME_SGL_DESCR_TYPE_SEGMENT:
1002 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1003 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
1004 default:
1005 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
1006 }
1007
1008 dlen = le32_to_cpu(segment[i].len);
1009
1010 if (!dlen) {
1011 continue;
1012 }
1013
1014 if (*len == 0) {
1015 /*
1016 * All data has been mapped, but the SGL contains additional
1017 * segments and/or descriptors. The controller might accept
1018 * ignoring the rest of the SGL.
1019 */
1020 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1021 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1022 break;
1023 }
1024
1025 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1026 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1027 }
1028
1029 trans_len = MIN(*len, dlen);
1030
1031 addr = le64_to_cpu(segment[i].addr);
1032
1033 if (UINT64_MAX - addr < dlen) {
1034 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1035 }
1036
1037 status = nvme_map_addr(n, sg, addr, trans_len);
1038 if (status) {
1039 return status;
1040 }
1041
1042 *len -= trans_len;
1043 }
1044
1045 return NVME_SUCCESS;
1046 }
1047
1048 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1049 size_t len, NvmeCmd *cmd)
1050 {
1051 /*
1052 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1053 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1054 * to be larger (as in number of bytes required to describe the SGL
1055 * descriptors and segment chain) than the command transfer size, so it is
1056 * not bounded by MDTS.
1057 */
1058 #define SEG_CHUNK_SIZE 256
1059
1060 QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE];
1061 NvmeSglDescriptor *sgld, *last_sgld;
1062 uint64_t nsgld;
1063 uint32_t seg_len;
1064 uint16_t status;
1065 hwaddr addr;
1066 int ret;
1067
1068 sgld = &sgl;
1069 addr = le64_to_cpu(sgl.addr);
1070
1071 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1072
1073 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1074
1075 /*
1076 * If the entire transfer can be described with a single data block it can
1077 * be mapped directly.
1078 */
1079 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1080 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1081 if (status) {
1082 goto unmap;
1083 }
1084
1085 goto out;
1086 }
1087
1088 for (;;) {
1089 switch (NVME_SGL_TYPE(sgld->type)) {
1090 case NVME_SGL_DESCR_TYPE_SEGMENT:
1091 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1092 break;
1093 default:
1094 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1095 }
1096
1097 seg_len = le32_to_cpu(sgld->len);
1098
1099 /* check the length of the (Last) Segment descriptor */
1100 if (!seg_len || seg_len & 0xf) {
1101 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1102 }
1103
1104 if (UINT64_MAX - addr < seg_len) {
1105 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1106 }
1107
1108 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1109
1110 while (nsgld > SEG_CHUNK_SIZE) {
1111 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1112 trace_pci_nvme_err_addr_read(addr);
1113 status = NVME_DATA_TRAS_ERROR;
1114 goto unmap;
1115 }
1116
1117 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1118 &len, cmd);
1119 if (status) {
1120 goto unmap;
1121 }
1122
1123 nsgld -= SEG_CHUNK_SIZE;
1124 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1125 }
1126
1127 ret = nvme_addr_read(n, addr, segment, nsgld *
1128 sizeof(NvmeSglDescriptor));
1129 if (ret) {
1130 trace_pci_nvme_err_addr_read(addr);
1131 status = NVME_DATA_TRAS_ERROR;
1132 goto unmap;
1133 }
1134
1135 last_sgld = &segment[nsgld - 1];
1136
1137 /*
1138 * If the segment ends with a Data Block, then we are done.
1139 */
1140 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1141 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1142 if (status) {
1143 goto unmap;
1144 }
1145
1146 goto out;
1147 }
1148
1149 /*
1150 * If the last descriptor was not a Data Block, then the current
1151 * segment must not be a Last Segment.
1152 */
1153 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1154 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1155 goto unmap;
1156 }
1157
1158 sgld = last_sgld;
1159 addr = le64_to_cpu(sgld->addr);
1160
1161 /*
1162 * Do not map the last descriptor; it will be a Segment or Last Segment
1163 * descriptor and is handled by the next iteration.
1164 */
1165 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1166 if (status) {
1167 goto unmap;
1168 }
1169 }
1170
1171 out:
1172 /* if there is any residual left in len, the SGL was too short */
1173 if (len) {
1174 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1175 goto unmap;
1176 }
1177
1178 return NVME_SUCCESS;
1179
1180 unmap:
1181 nvme_sg_unmap(sg);
1182 return status;
1183 }
1184
1185 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1186 NvmeCmd *cmd)
1187 {
1188 uint64_t prp1, prp2;
1189
1190 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1191 case NVME_PSDT_PRP:
1192 prp1 = le64_to_cpu(cmd->dptr.prp1);
1193 prp2 = le64_to_cpu(cmd->dptr.prp2);
1194
1195 return nvme_map_prp(n, sg, prp1, prp2, len);
1196 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1197 case NVME_PSDT_SGL_MPTR_SGL:
1198 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1199 default:
1200 return NVME_INVALID_FIELD;
1201 }
1202 }
1203
1204 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1205 NvmeCmd *cmd)
1206 {
1207 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1208 hwaddr mptr = le64_to_cpu(cmd->mptr);
1209 uint16_t status;
1210
1211 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1212 NvmeSglDescriptor sgl;
1213
1214 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1215 return NVME_DATA_TRAS_ERROR;
1216 }
1217
1218 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1219 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1220 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1221 }
1222
1223 return status;
1224 }
1225
1226 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1227 status = nvme_map_addr(n, sg, mptr, len);
1228 if (status) {
1229 nvme_sg_unmap(sg);
1230 }
1231
1232 return status;
1233 }
1234
1235 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1236 {
1237 NvmeNamespace *ns = req->ns;
1238 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1239 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1240 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1241 size_t len = nvme_l2b(ns, nlb);
1242 uint16_t status;
1243
1244 if (nvme_ns_ext(ns) &&
1245 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1246 NvmeSg sg;
1247
1248 len += nvme_m2b(ns, nlb);
1249
1250 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1251 if (status) {
1252 return status;
1253 }
1254
1255 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1256 nvme_sg_split(&sg, ns, &req->sg, NULL);
1257 nvme_sg_unmap(&sg);
1258
1259 return NVME_SUCCESS;
1260 }
1261
1262 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1263 }
1264
1265 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1266 {
1267 NvmeNamespace *ns = req->ns;
1268 size_t len = nvme_m2b(ns, nlb);
1269 uint16_t status;
1270
1271 if (nvme_ns_ext(ns)) {
1272 NvmeSg sg;
1273
1274 len += nvme_l2b(ns, nlb);
1275
1276 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1277 if (status) {
1278 return status;
1279 }
1280
1281 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1282 nvme_sg_split(&sg, ns, NULL, &req->sg);
1283 nvme_sg_unmap(&sg);
1284
1285 return NVME_SUCCESS;
1286 }
1287
1288 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1289 }
1290
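/*
 * Transfer to/from an interleaved (extended LBA) buffer: copy runs of
 * `bytes` bytes out of/into `sg`, skipping `skip_bytes` between runs and
 * starting `offset` bytes into the mapping. This is how only the data or
 * only the metadata portion of an extended-LBA mapping is accessed.
 */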
1291 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1292 uint32_t len, uint32_t bytes,
1293 int32_t skip_bytes, int64_t offset,
1294 NvmeTxDirection dir)
1295 {
1296 hwaddr addr;
1297 uint32_t trans_len, count = bytes;
1298 bool dma = sg->flags & NVME_SG_DMA;
1299 int64_t sge_len;
1300 int sg_idx = 0;
1301 int ret;
1302
1303 assert(sg->flags & NVME_SG_ALLOC);
1304
1305 while (len) {
1306 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1307
1308 if (sge_len - offset < 0) {
1309 offset -= sge_len;
1310 sg_idx++;
1311 continue;
1312 }
1313
1314 if (sge_len == offset) {
1315 offset = 0;
1316 sg_idx++;
1317 continue;
1318 }
1319
1320 trans_len = MIN(len, count);
1321 trans_len = MIN(trans_len, sge_len - offset);
1322
1323 if (dma) {
1324 addr = sg->qsg.sg[sg_idx].base + offset;
1325 } else {
1326 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1327 }
1328
1329 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1330 ret = nvme_addr_read(n, addr, ptr, trans_len);
1331 } else {
1332 ret = nvme_addr_write(n, addr, ptr, trans_len);
1333 }
1334
1335 if (ret) {
1336 return NVME_DATA_TRAS_ERROR;
1337 }
1338
1339 ptr += trans_len;
1340 len -= trans_len;
1341 count -= trans_len;
1342 offset += trans_len;
1343
1344 if (count == 0) {
1345 count = bytes;
1346 offset += skip_bytes;
1347 }
1348 }
1349
1350 return NVME_SUCCESS;
1351 }
1352
1353 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1354 NvmeTxDirection dir)
1355 {
1356 assert(sg->flags & NVME_SG_ALLOC);
1357
1358 if (sg->flags & NVME_SG_DMA) {
1359 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1360 dma_addr_t residual;
1361
1362 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1363 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1364 } else {
1365 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1366 }
1367
1368 if (unlikely(residual)) {
1369 trace_pci_nvme_err_invalid_dma();
1370 return NVME_INVALID_FIELD | NVME_DNR;
1371 }
1372 } else {
1373 size_t bytes;
1374
1375 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1376 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1377 } else {
1378 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1379 }
1380
1381 if (unlikely(bytes != len)) {
1382 trace_pci_nvme_err_invalid_dma();
1383 return NVME_INVALID_FIELD | NVME_DNR;
1384 }
1385 }
1386
1387 return NVME_SUCCESS;
1388 }
1389
1390 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1391 NvmeRequest *req)
1392 {
1393 uint16_t status;
1394
1395 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1396 if (status) {
1397 return status;
1398 }
1399
1400 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1401 }
1402
1403 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1404 NvmeRequest *req)
1405 {
1406 uint16_t status;
1407
1408 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1409 if (status) {
1410 return status;
1411 }
1412
1413 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1414 }
1415
1416 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1417 NvmeTxDirection dir, NvmeRequest *req)
1418 {
1419 NvmeNamespace *ns = req->ns;
1420 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1421 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1422 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1423
1424 if (nvme_ns_ext(ns) &&
1425 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1426 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1427 ns->lbaf.ms, 0, dir);
1428 }
1429
1430 return nvme_tx(n, &req->sg, ptr, len, dir);
1431 }
1432
1433 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1434 NvmeTxDirection dir, NvmeRequest *req)
1435 {
1436 NvmeNamespace *ns = req->ns;
1437 uint16_t status;
1438
1439 if (nvme_ns_ext(ns)) {
1440 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1441 ns->lbasz, ns->lbasz, dir);
1442 }
1443
1444 nvme_sg_unmap(&req->sg);
1445
1446 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1447 if (status) {
1448 return status;
1449 }
1450
1451 return nvme_tx(n, &req->sg, ptr, len, dir);
1452 }
1453
1454 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1455 uint32_t align, BlockCompletionFunc *cb,
1456 NvmeRequest *req)
1457 {
1458 assert(req->sg.flags & NVME_SG_ALLOC);
1459
1460 if (req->sg.flags & NVME_SG_DMA) {
1461 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1462 } else {
1463 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1464 }
1465 }
1466
1467 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1468 uint32_t align, BlockCompletionFunc *cb,
1469 NvmeRequest *req)
1470 {
1471 assert(req->sg.flags & NVME_SG_ALLOC);
1472
1473 if (req->sg.flags & NVME_SG_DMA) {
1474 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1475 } else {
1476 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1477 }
1478 }
1479
1480 static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1481 {
1482 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1483
1484 stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1485 MEMTXATTRS_UNSPECIFIED);
1486 }
1487
1488 static void nvme_update_cq_head(NvmeCQueue *cq)
1489 {
1490 ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1491 MEMTXATTRS_UNSPECIFIED);
1492
1493 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1494 }
1495
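/*
 * Post completion queue entries for finished requests on this CQ while there
 * is room in the queue: write each CQE to guest memory, recycle the request
 * onto its submission queue's free list and assert the interrupt if any
 * entries were posted.
 */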
1496 static void nvme_post_cqes(void *opaque)
1497 {
1498 NvmeCQueue *cq = opaque;
1499 NvmeCtrl *n = cq->ctrl;
1500 NvmeRequest *req, *next;
1501 bool pending = cq->head != cq->tail;
1502 int ret;
1503
1504 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1505 NvmeSQueue *sq;
1506 hwaddr addr;
1507
1508 if (n->dbbuf_enabled) {
1509 nvme_update_cq_eventidx(cq);
1510 nvme_update_cq_head(cq);
1511 }
1512
1513 if (nvme_cq_full(cq)) {
1514 break;
1515 }
1516
1517 sq = req->sq;
1518 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1519 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1520 req->cqe.sq_head = cpu_to_le16(sq->head);
1521 addr = cq->dma_addr + (cq->tail << NVME_CQES);
1522 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1523 sizeof(req->cqe));
1524 if (ret) {
1525 trace_pci_nvme_err_addr_write(addr);
1526 trace_pci_nvme_err_cfs();
1527 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1528 break;
1529 }
1530
1531 QTAILQ_REMOVE(&cq->req_list, req, entry);
1532
1533 nvme_inc_cq_tail(cq);
1534 nvme_sg_unmap(&req->sg);
1535
1536 if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
1537 qemu_bh_schedule(sq->bh);
1538 }
1539
1540 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1541 }
1542 if (cq->tail != cq->head) {
1543 if (cq->irq_enabled && !pending) {
1544 n->cq_pending++;
1545 }
1546
1547 nvme_irq_assert(n, cq);
1548 }
1549 }
1550
1551 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1552 {
1553 assert(cq->cqid == req->sq->cqid);
1554 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1555 le32_to_cpu(req->cqe.result),
1556 le32_to_cpu(req->cqe.dw1),
1557 req->status);
1558
1559 if (req->status) {
1560 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1561 req->status, req->cmd.opcode);
1562 }
1563
1564 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1565 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1566
1567 qemu_bh_schedule(cq->bh);
1568 }
1569
1570 static void nvme_process_aers(void *opaque)
1571 {
1572 NvmeCtrl *n = opaque;
1573 NvmeAsyncEvent *event, *next;
1574
1575 trace_pci_nvme_process_aers(n->aer_queued);
1576
1577 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1578 NvmeRequest *req;
1579 NvmeAerResult *result;
1580
1581 /* can't post cqe if there is nothing to complete */
1582 if (!n->outstanding_aers) {
1583 trace_pci_nvme_no_outstanding_aers();
1584 break;
1585 }
1586
1587 /* ignore if masked (cqe posted, but event not cleared) */
1588 if (n->aer_mask & (1 << event->result.event_type)) {
1589 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1590 continue;
1591 }
1592
1593 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1594 n->aer_queued--;
1595
1596 n->aer_mask |= 1 << event->result.event_type;
1597 n->outstanding_aers--;
1598
1599 req = n->aer_reqs[n->outstanding_aers];
1600
1601 result = (NvmeAerResult *) &req->cqe.result;
1602 result->event_type = event->result.event_type;
1603 result->event_info = event->result.event_info;
1604 result->log_page = event->result.log_page;
1605 g_free(event);
1606
1607 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1608 result->log_page);
1609
1610 nvme_enqueue_req_completion(&n->admin_cq, req);
1611 }
1612 }
1613
1614 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1615 uint8_t event_info, uint8_t log_page)
1616 {
1617 NvmeAsyncEvent *event;
1618
1619 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1620
1621 if (n->aer_queued == n->params.aer_max_queued) {
1622 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1623 return;
1624 }
1625
1626 event = g_new(NvmeAsyncEvent, 1);
1627 event->result = (NvmeAerResult) {
1628 .event_type = event_type,
1629 .event_info = event_info,
1630 .log_page = log_page,
1631 };
1632
1633 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1634 n->aer_queued++;
1635
1636 nvme_process_aers(n);
1637 }
1638
1639 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1640 {
1641 uint8_t aer_info;
1642
1643     /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1644 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1645 return;
1646 }
1647
1648 switch (event) {
1649 case NVME_SMART_SPARE:
1650 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1651 break;
1652 case NVME_SMART_TEMPERATURE:
1653 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1654 break;
1655 case NVME_SMART_RELIABILITY:
1656 case NVME_SMART_MEDIA_READ_ONLY:
1657 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1658 case NVME_SMART_PMR_UNRELIABLE:
1659 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1660 break;
1661 default:
1662 return;
1663 }
1664
1665 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1666 }
1667
1668 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1669 {
1670 NvmeAsyncEvent *event, *next;
1671
1672 n->aer_mask &= ~(1 << event_type);
1673
1674 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1675 if (event->result.event_type == event_type) {
1676 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1677 n->aer_queued--;
1678 g_free(event);
1679 }
1680 }
1681 }
1682
1683 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1684 {
1685 uint8_t mdts = n->params.mdts;
1686
1687 if (mdts && len > n->page_size << mdts) {
1688 trace_pci_nvme_err_mdts(len);
1689 return NVME_INVALID_FIELD | NVME_DNR;
1690 }
1691
1692 return NVME_SUCCESS;
1693 }
1694
1695 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1696 uint32_t nlb)
1697 {
1698 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1699
1700 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1701 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1702 return NVME_LBA_RANGE | NVME_DNR;
1703 }
1704
1705 return NVME_SUCCESS;
1706 }
1707
1708 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1709 uint32_t nlb, int flags)
1710 {
1711 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1712
1713 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1714 int64_t offset = nvme_l2b(ns, slba);
1715 int ret;
1716
1717 /*
1718      * `pnum` holds the number of bytes after offset that share the same
1719 * allocation status as the byte at offset. If `pnum` is different from
1720 * `bytes`, we should check the allocation status of the next range and
1721 * continue this until all bytes have been checked.
1722 */
1723 do {
1724 bytes -= pnum;
1725
1726 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1727 if (ret < 0) {
1728 return ret;
1729 }
1730
1731
1732 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1733 !!(ret & BDRV_BLOCK_ZERO));
1734
1735 if (!(ret & flags)) {
1736 return 1;
1737 }
1738
1739 offset += pnum;
1740 } while (pnum != bytes);
1741
1742 return 0;
1743 }
1744
1745 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1746 uint32_t nlb)
1747 {
1748 int ret;
1749 Error *err = NULL;
1750
1751 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1752 if (ret) {
1753 if (ret < 0) {
1754 error_setg_errno(&err, -ret, "unable to get block status");
1755 error_report_err(err);
1756
1757 return NVME_INTERNAL_DEV_ERROR;
1758 }
1759
1760 return NVME_DULB;
1761 }
1762
1763 return NVME_SUCCESS;
1764 }
1765
1766 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1767 {
1768 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1769 slba / ns->zone_size;
1770 }
1771
1772 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1773 {
1774 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1775
1776 if (zone_idx >= ns->num_zones) {
1777 return NULL;
1778 }
1779
1780 return &ns->zone_array[zone_idx];
1781 }
1782
1783 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1784 {
1785 uint64_t zslba = zone->d.zslba;
1786
1787 switch (nvme_get_zone_state(zone)) {
1788 case NVME_ZONE_STATE_EMPTY:
1789 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1790 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1791 case NVME_ZONE_STATE_CLOSED:
1792 return NVME_SUCCESS;
1793 case NVME_ZONE_STATE_FULL:
1794 trace_pci_nvme_err_zone_is_full(zslba);
1795 return NVME_ZONE_FULL;
1796 case NVME_ZONE_STATE_OFFLINE:
1797 trace_pci_nvme_err_zone_is_offline(zslba);
1798 return NVME_ZONE_OFFLINE;
1799 case NVME_ZONE_STATE_READ_ONLY:
1800 trace_pci_nvme_err_zone_is_read_only(zslba);
1801 return NVME_ZONE_READ_ONLY;
1802 default:
1803 g_assert_not_reached();
1804 }
1805
1806 return NVME_INTERNAL_DEV_ERROR;
1807 }
1808
1809 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1810 uint64_t slba, uint32_t nlb)
1811 {
1812 uint64_t zcap = nvme_zone_wr_boundary(zone);
1813 uint16_t status;
1814
1815 status = nvme_check_zone_state_for_write(zone);
1816 if (status) {
1817 return status;
1818 }
1819
1820 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1821 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1822
1823 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1824 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1825 return NVME_ZONE_INVALID_WRITE;
1826 }
1827 } else {
1828 if (unlikely(slba != zone->w_ptr)) {
1829 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1830 zone->w_ptr);
1831 return NVME_ZONE_INVALID_WRITE;
1832 }
1833 }
1834
1835 if (unlikely((slba + nlb) > zcap)) {
1836 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1837 return NVME_ZONE_BOUNDARY_ERROR;
1838 }
1839
1840 return NVME_SUCCESS;
1841 }
1842
1843 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1844 {
1845 switch (nvme_get_zone_state(zone)) {
1846 case NVME_ZONE_STATE_EMPTY:
1847 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1848 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1849 case NVME_ZONE_STATE_FULL:
1850 case NVME_ZONE_STATE_CLOSED:
1851 case NVME_ZONE_STATE_READ_ONLY:
1852 return NVME_SUCCESS;
1853 case NVME_ZONE_STATE_OFFLINE:
1854 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1855 return NVME_ZONE_OFFLINE;
1856 default:
1857 g_assert_not_reached();
1858 }
1859
1860 return NVME_INTERNAL_DEV_ERROR;
1861 }
1862
1863 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1864 uint32_t nlb)
1865 {
1866 NvmeZone *zone;
1867 uint64_t bndry, end;
1868 uint16_t status;
1869
1870 zone = nvme_get_zone_by_slba(ns, slba);
1871 assert(zone);
1872
1873 bndry = nvme_zone_rd_boundary(ns, zone);
1874 end = slba + nlb;
1875
1876 status = nvme_check_zone_state_for_read(zone);
1877 if (status) {
1878 ;
1879 } else if (unlikely(end > bndry)) {
1880 if (!ns->params.cross_zone_read) {
1881 status = NVME_ZONE_BOUNDARY_ERROR;
1882 } else {
1883 /*
1884 * Read across zone boundary - check that all subsequent
1885 * zones that are being read have an appropriate state.
1886 */
1887 do {
1888 zone++;
1889 status = nvme_check_zone_state_for_read(zone);
1890 if (status) {
1891 break;
1892 }
1893 } while (end > nvme_zone_rd_boundary(ns, zone));
1894 }
1895 }
1896
1897 return status;
1898 }
1899
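/*
 * The nvme_zrm_* helpers implement the zone state machine: they adjust the
 * per-namespace active/open resource counters, release any ZRWA resources
 * held by the zone and move it to its new state.
 */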
1900 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1901 {
1902 switch (nvme_get_zone_state(zone)) {
1903 case NVME_ZONE_STATE_FULL:
1904 return NVME_SUCCESS;
1905
1906 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1907 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1908 nvme_aor_dec_open(ns);
1909 /* fallthrough */
1910 case NVME_ZONE_STATE_CLOSED:
1911 nvme_aor_dec_active(ns);
1912
1913 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1914 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1915 if (ns->params.numzrwa) {
1916 ns->zns.numzrwa++;
1917 }
1918 }
1919
1920 /* fallthrough */
1921 case NVME_ZONE_STATE_EMPTY:
1922 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1923 return NVME_SUCCESS;
1924
1925 default:
1926 return NVME_ZONE_INVAL_TRANSITION;
1927 }
1928 }
1929
1930 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1931 {
1932 switch (nvme_get_zone_state(zone)) {
1933 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1934 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1935 nvme_aor_dec_open(ns);
1936 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1937 /* fall through */
1938 case NVME_ZONE_STATE_CLOSED:
1939 return NVME_SUCCESS;
1940
1941 default:
1942 return NVME_ZONE_INVAL_TRANSITION;
1943 }
1944 }
1945
1946 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1947 {
1948 switch (nvme_get_zone_state(zone)) {
1949 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1950 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1951 nvme_aor_dec_open(ns);
1952 /* fallthrough */
1953 case NVME_ZONE_STATE_CLOSED:
1954 nvme_aor_dec_active(ns);
1955
1956 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1957 if (ns->params.numzrwa) {
1958 ns->zns.numzrwa++;
1959 }
1960 }
1961
1962 /* fallthrough */
1963 case NVME_ZONE_STATE_FULL:
1964 zone->w_ptr = zone->d.zslba;
1965 zone->d.wp = zone->w_ptr;
1966 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1967 /* fallthrough */
1968 case NVME_ZONE_STATE_EMPTY:
1969 return NVME_SUCCESS;
1970
1971 default:
1972 return NVME_ZONE_INVAL_TRANSITION;
1973 }
1974 }
1975
1976 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1977 {
1978 NvmeZone *zone;
1979
1980 if (ns->params.max_open_zones &&
1981 ns->nr_open_zones == ns->params.max_open_zones) {
1982 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1983 if (zone) {
1984 /*
1985 * Automatically close this implicitly open zone.
1986 */
1987 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1988 nvme_zrm_close(ns, zone);
1989 }
1990 }
1991 }
1992
1993 enum {
1994 NVME_ZRM_AUTO = 1 << 0,
1995 NVME_ZRM_ZRWA = 1 << 1,
1996 };
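/*
 * Zone open flags: NVME_ZRM_AUTO marks a transition triggered implicitly by
 * a write (the zone becomes Implicitly Opened), while NVME_ZRM_ZRWA requests
 * allocation of a Zone Random Write Area as part of the open.
 */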
1997
1998 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1999 NvmeZone *zone, int flags)
2000 {
2001 int act = 0;
2002 uint16_t status;
2003
2004 switch (nvme_get_zone_state(zone)) {
2005 case NVME_ZONE_STATE_EMPTY:
2006 act = 1;
2007
2008 /* fallthrough */
2009
2010 case NVME_ZONE_STATE_CLOSED:
2011 if (n->params.auto_transition_zones) {
2012 nvme_zrm_auto_transition_zone(ns);
2013 }
2014 status = nvme_zns_check_resources(ns, act, 1,
2015 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2016 if (status) {
2017 return status;
2018 }
2019
2020 if (act) {
2021 nvme_aor_inc_active(ns);
2022 }
2023
2024 nvme_aor_inc_open(ns);
2025
2026 if (flags & NVME_ZRM_AUTO) {
2027 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2028 return NVME_SUCCESS;
2029 }
2030
2031 /* fallthrough */
2032
2033 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2034 if (flags & NVME_ZRM_AUTO) {
2035 return NVME_SUCCESS;
2036 }
2037
2038 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2039
2040 /* fallthrough */
2041
2042 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2043 if (flags & NVME_ZRM_ZRWA) {
2044 ns->zns.numzrwa--;
2045
2046 zone->d.za |= NVME_ZA_ZRWA_VALID;
2047 }
2048
2049 return NVME_SUCCESS;
2050
2051 default:
2052 return NVME_ZONE_INVAL_TRANSITION;
2053 }
2054 }
2055
2056 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2057 NvmeZone *zone)
2058 {
2059 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2060 }
2061
2062 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2063 uint32_t nlb)
2064 {
2065 zone->d.wp += nlb;
2066
2067 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2068 nvme_zrm_finish(ns, zone);
2069 }
2070 }
2071
2072 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2073 uint32_t nlbc)
2074 {
2075 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2076
2077 nlbc = nzrwafgs * ns->zns.zrwafg;
2078
2079 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2080
2081 zone->w_ptr += nlbc;
2082
2083 nvme_advance_zone_wp(ns, zone, nlbc);
2084 }
2085
2086 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2087 {
2088 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2089 NvmeZone *zone;
2090 uint64_t slba;
2091 uint32_t nlb;
2092
2093 slba = le64_to_cpu(rw->slba);
2094 nlb = le16_to_cpu(rw->nlb) + 1;
2095 zone = nvme_get_zone_by_slba(ns, slba);
2096 assert(zone);
2097
2098 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2099 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2100 uint64_t elba = slba + nlb - 1;
2101
2102 if (elba > ezrwa) {
2103 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2104 }
2105
2106 return;
2107 }
2108
2109 nvme_advance_zone_wp(ns, zone, nlb);
2110 }
2111
2112 static inline bool nvme_is_write(NvmeRequest *req)
2113 {
2114 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2115
2116 return rw->opcode == NVME_CMD_WRITE ||
2117 rw->opcode == NVME_CMD_ZONE_APPEND ||
2118 rw->opcode == NVME_CMD_WRITE_ZEROES;
2119 }
2120
2121 static void nvme_misc_cb(void *opaque, int ret)
2122 {
2123 NvmeRequest *req = opaque;
2124 uint16_t cid = nvme_cid(req);
2125
2126 trace_pci_nvme_misc_cb(cid);
2127
2128 if (ret) {
2129 if (!req->status) {
2130 req->status = NVME_INTERNAL_DEV_ERROR;
2131 }
2132
2133 trace_pci_nvme_err_aio(cid, strerror(-ret), req->status);
2134 }
2135
2136 nvme_enqueue_req_completion(nvme_cq(req), req);
2137 }
2138
2139 void nvme_rw_complete_cb(void *opaque, int ret)
2140 {
2141 NvmeRequest *req = opaque;
2142 NvmeNamespace *ns = req->ns;
2143 BlockBackend *blk = ns->blkconf.blk;
2144 BlockAcctCookie *acct = &req->acct;
2145 BlockAcctStats *stats = blk_get_stats(blk);
2146
2147 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2148
2149 if (ret) {
2150 Error *err = NULL;
2151
2152 block_acct_failed(stats, acct);
2153
2154 switch (req->cmd.opcode) {
2155 case NVME_CMD_READ:
2156 req->status = NVME_UNRECOVERED_READ;
2157 break;
2158
2159 case NVME_CMD_WRITE:
2160 case NVME_CMD_WRITE_ZEROES:
2161 case NVME_CMD_ZONE_APPEND:
2162 req->status = NVME_WRITE_FAULT;
2163 break;
2164
2165 default:
2166 req->status = NVME_INTERNAL_DEV_ERROR;
2167 break;
2168 }
2169
2170 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2171
2172 error_setg_errno(&err, -ret, "aio failed");
2173 error_report_err(err);
2174 } else {
2175 block_acct_done(stats, acct);
2176 }
2177
2178 if (ns->params.zoned && nvme_is_write(req)) {
2179 nvme_finalize_zoned_write(ns, req);
2180 }
2181
2182 nvme_enqueue_req_completion(nvme_cq(req), req);
2183 }
2184
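/*
 * Completion of the data portion of a read/write. If the namespace is
 * formatted with metadata, a second I/O is issued from here for the metadata
 * region (or the metadata is zeroed for Write Zeroes) before the request is
 * finally completed through nvme_rw_complete_cb().
 */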
2185 static void nvme_rw_cb(void *opaque, int ret)
2186 {
2187 NvmeRequest *req = opaque;
2188 NvmeNamespace *ns = req->ns;
2189
2190 BlockBackend *blk = ns->blkconf.blk;
2191
2192 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2193
2194 if (ret) {
2195 goto out;
2196 }
2197
2198 if (ns->lbaf.ms) {
2199 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2200 uint64_t slba = le64_to_cpu(rw->slba);
2201 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2202 uint64_t offset = nvme_moff(ns, slba);
2203
2204 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2205 size_t mlen = nvme_m2b(ns, nlb);
2206
2207 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2208 BDRV_REQ_MAY_UNMAP,
2209 nvme_rw_complete_cb, req);
2210 return;
2211 }
2212
2213 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2214 uint16_t status;
2215
2216 nvme_sg_unmap(&req->sg);
2217 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2218 if (status) {
2219 ret = -EFAULT;
2220 goto out;
2221 }
2222
2223 if (req->cmd.opcode == NVME_CMD_READ) {
2224 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2225 }
2226
2227 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2228 }
2229 }
2230
2231 out:
2232 nvme_rw_complete_cb(req, ret);
2233 }
2234
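/*
 * Verify reads both data and metadata into bounce buffers and, for
 * namespaces formatted with protection information, re-runs the DIF checks
 * on the controller side; no data is transferred back to the host.
 */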
2235 static void nvme_verify_cb(void *opaque, int ret)
2236 {
2237 NvmeBounceContext *ctx = opaque;
2238 NvmeRequest *req = ctx->req;
2239 NvmeNamespace *ns = req->ns;
2240 BlockBackend *blk = ns->blkconf.blk;
2241 BlockAcctCookie *acct = &req->acct;
2242 BlockAcctStats *stats = blk_get_stats(blk);
2243 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2244 uint64_t slba = le64_to_cpu(rw->slba);
2245 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2246 uint16_t apptag = le16_to_cpu(rw->apptag);
2247 uint16_t appmask = le16_to_cpu(rw->appmask);
2248 uint64_t reftag = le32_to_cpu(rw->reftag);
2249 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2250 uint16_t status;
2251
2252 reftag |= cdw3 << 32;
2253
2254 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2255
2256 if (ret) {
2257 block_acct_failed(stats, acct);
2258 req->status = NVME_UNRECOVERED_READ;
2259
2260 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2261
2262 goto out;
2263 }
2264
2265 block_acct_done(stats, acct);
2266
2267 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2268 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2269 ctx->mdata.iov.size, slba);
2270 if (status) {
2271 req->status = status;
2272 goto out;
2273 }
2274
2275 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2276 ctx->mdata.bounce, ctx->mdata.iov.size,
2277 prinfo, slba, apptag, appmask, &reftag);
2278 }
2279
2280 out:
2281 qemu_iovec_destroy(&ctx->data.iov);
2282 g_free(ctx->data.bounce);
2283
2284 qemu_iovec_destroy(&ctx->mdata.iov);
2285 g_free(ctx->mdata.bounce);
2286
2287 g_free(ctx);
2288
2289 nvme_enqueue_req_completion(nvme_cq(req), req);
2290 }
2291
2292
2293 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2294 {
2295 NvmeBounceContext *ctx = opaque;
2296 NvmeRequest *req = ctx->req;
2297 NvmeNamespace *ns = req->ns;
2298 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2299 uint64_t slba = le64_to_cpu(rw->slba);
2300 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2301 size_t mlen = nvme_m2b(ns, nlb);
2302 uint64_t offset = nvme_moff(ns, slba);
2303 BlockBackend *blk = ns->blkconf.blk;
2304
2305 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2306
2307 if (ret) {
2308 goto out;
2309 }
2310
2311 ctx->mdata.bounce = g_malloc(mlen);
2312
2313 qemu_iovec_reset(&ctx->mdata.iov);
2314 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2315
2316 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2317 nvme_verify_cb, ctx);
2318 return;
2319
2320 out:
2321 nvme_verify_cb(ctx, ret);
2322 }
2323
2324 struct nvme_compare_ctx {
2325 struct {
2326 QEMUIOVector iov;
2327 uint8_t *bounce;
2328 } data;
2329
2330 struct {
2331 QEMUIOVector iov;
2332 uint8_t *bounce;
2333 } mdata;
2334 };
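/*
 * Compare runs in two stages: nvme_compare_data_cb() checks the data blocks
 * against the host buffer and, if the namespace carries metadata, kicks off
 * a metadata read that is checked in nvme_compare_mdata_cb().
 */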
2335
2336 static void nvme_compare_mdata_cb(void *opaque, int ret)
2337 {
2338 NvmeRequest *req = opaque;
2339 NvmeNamespace *ns = req->ns;
2340 NvmeCtrl *n = nvme_ctrl(req);
2341 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2342 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2343 uint16_t apptag = le16_to_cpu(rw->apptag);
2344 uint16_t appmask = le16_to_cpu(rw->appmask);
2345 uint64_t reftag = le32_to_cpu(rw->reftag);
2346 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2347 struct nvme_compare_ctx *ctx = req->opaque;
2348 g_autofree uint8_t *buf = NULL;
2349 BlockBackend *blk = ns->blkconf.blk;
2350 BlockAcctCookie *acct = &req->acct;
2351 BlockAcctStats *stats = blk_get_stats(blk);
2352 uint16_t status = NVME_SUCCESS;
2353
2354 reftag |= cdw3 << 32;
2355
2356 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2357
2358 if (ret) {
2359 block_acct_failed(stats, acct);
2360 req->status = NVME_UNRECOVERED_READ;
2361
2362 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2363
2364 goto out;
2365 }
2366
2367 buf = g_malloc(ctx->mdata.iov.size);
2368
2369 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2370 NVME_TX_DIRECTION_TO_DEVICE, req);
2371 if (status) {
2372 req->status = status;
2373 goto out;
2374 }
2375
2376 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2377 uint64_t slba = le64_to_cpu(rw->slba);
2378 uint8_t *bufp;
2379 uint8_t *mbufp = ctx->mdata.bounce;
2380 uint8_t *end = mbufp + ctx->mdata.iov.size;
2381 int16_t pil = 0;
2382
2383 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2384 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2385 slba, apptag, appmask, &reftag);
2386 if (status) {
2387 req->status = status;
2388 goto out;
2389 }
2390
2391 /*
2392 * When formatted with protection information, do not compare the DIF
2393 * tuple.
2394 */
2395 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2396 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2397 }
2398
2399 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2400 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2401 req->status = NVME_CMP_FAILURE | NVME_DNR;
2402 goto out;
2403 }
2404 }
2405
2406 goto out;
2407 }
2408
2409 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2410 req->status = NVME_CMP_FAILURE | NVME_DNR;
2411 goto out;
2412 }
2413
2414 block_acct_done(stats, acct);
2415
2416 out:
2417 qemu_iovec_destroy(&ctx->data.iov);
2418 g_free(ctx->data.bounce);
2419
2420 qemu_iovec_destroy(&ctx->mdata.iov);
2421 g_free(ctx->mdata.bounce);
2422
2423 g_free(ctx);
2424
2425 nvme_enqueue_req_completion(nvme_cq(req), req);
2426 }
2427
2428 static void nvme_compare_data_cb(void *opaque, int ret)
2429 {
2430 NvmeRequest *req = opaque;
2431 NvmeCtrl *n = nvme_ctrl(req);
2432 NvmeNamespace *ns = req->ns;
2433 BlockBackend *blk = ns->blkconf.blk;
2434 BlockAcctCookie *acct = &req->acct;
2435 BlockAcctStats *stats = blk_get_stats(blk);
2436
2437 struct nvme_compare_ctx *ctx = req->opaque;
2438 g_autofree uint8_t *buf = NULL;
2439 uint16_t status;
2440
2441 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2442
2443 if (ret) {
2444 block_acct_failed(stats, acct);
2445 req->status = NVME_UNRECOVERED_READ;
2446
2447 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2448
2449 goto out;
2450 }
2451
2452 buf = g_malloc(ctx->data.iov.size);
2453
2454 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2455 NVME_TX_DIRECTION_TO_DEVICE, req);
2456 if (status) {
2457 req->status = status;
2458 goto out;
2459 }
2460
2461 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2462 req->status = NVME_CMP_FAILURE | NVME_DNR;
2463 goto out;
2464 }
2465
2466 if (ns->lbaf.ms) {
2467 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2468 uint64_t slba = le64_to_cpu(rw->slba);
2469 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2470 size_t mlen = nvme_m2b(ns, nlb);
2471 uint64_t offset = nvme_moff(ns, slba);
2472
2473 ctx->mdata.bounce = g_malloc(mlen);
2474
2475 qemu_iovec_init(&ctx->mdata.iov, 1);
2476 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2477
2478 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2479 nvme_compare_mdata_cb, req);
2480 return;
2481 }
2482
2483 block_acct_done(stats, acct);
2484
2485 out:
2486 qemu_iovec_destroy(&ctx->data.iov);
2487 g_free(ctx->data.bounce);
2488 g_free(ctx);
2489
2490 nvme_enqueue_req_completion(nvme_cq(req), req);
2491 }
2492
2493 typedef struct NvmeDSMAIOCB {
2494 BlockAIOCB common;
2495 BlockAIOCB *aiocb;
2496 NvmeRequest *req;
2497 int ret;
2498
2499 NvmeDsmRange *range;
2500 unsigned int nr;
2501 unsigned int idx;
2502 } NvmeDSMAIOCB;
2503
2504 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2505 {
2506 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2507
2508 /* break nvme_dsm_cb loop */
2509 iocb->idx = iocb->nr;
2510 iocb->ret = -ECANCELED;
2511
2512 if (iocb->aiocb) {
2513 blk_aio_cancel_async(iocb->aiocb);
2514 iocb->aiocb = NULL;
2515 } else {
2516 /*
2517 * We only reach this if nvme_dsm_cancel() has already been called or
2518 * the command ran to completion.
2519 */
2520 assert(iocb->idx == iocb->nr);
2521 }
2522 }
2523
2524 static const AIOCBInfo nvme_dsm_aiocb_info = {
2525 .aiocb_size = sizeof(NvmeDSMAIOCB),
2526 .cancel_async = nvme_dsm_cancel,
2527 };
2528
2529 static void nvme_dsm_cb(void *opaque, int ret);
2530
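/*
 * Dataset Management (deallocate) is processed one range at a time:
 * nvme_dsm_cb() issues a discard for the next range and nvme_dsm_md_cb()
 * zeroes the corresponding metadata, but only if the discard actually left
 * the data blocks reading back as zeroes.
 */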
2531 static void nvme_dsm_md_cb(void *opaque, int ret)
2532 {
2533 NvmeDSMAIOCB *iocb = opaque;
2534 NvmeRequest *req = iocb->req;
2535 NvmeNamespace *ns = req->ns;
2536 NvmeDsmRange *range;
2537 uint64_t slba;
2538 uint32_t nlb;
2539
2540 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2541 goto done;
2542 }
2543
2544 range = &iocb->range[iocb->idx - 1];
2545 slba = le64_to_cpu(range->slba);
2546 nlb = le32_to_cpu(range->nlb);
2547
2548 /*
2549 * Check that all blocks were discarded (zeroed); otherwise we do not zero
2550 * the metadata.
2551 */
2552
2553 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2554 if (ret) {
2555 if (ret < 0) {
2556 goto done;
2557 }
2558
2559 nvme_dsm_cb(iocb, 0);
2560 return;
2561 }
2562
2563 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2564 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2565 nvme_dsm_cb, iocb);
2566 return;
2567
2568 done:
2569 nvme_dsm_cb(iocb, ret);
2570 }
2571
2572 static void nvme_dsm_cb(void *opaque, int ret)
2573 {
2574 NvmeDSMAIOCB *iocb = opaque;
2575 NvmeRequest *req = iocb->req;
2576 NvmeCtrl *n = nvme_ctrl(req);
2577 NvmeNamespace *ns = req->ns;
2578 NvmeDsmRange *range;
2579 uint64_t slba;
2580 uint32_t nlb;
2581
2582 if (iocb->ret < 0) {
2583 goto done;
2584 } else if (ret < 0) {
2585 iocb->ret = ret;
2586 goto done;
2587 }
2588
2589 next:
2590 if (iocb->idx == iocb->nr) {
2591 goto done;
2592 }
2593
2594 range = &iocb->range[iocb->idx++];
2595 slba = le64_to_cpu(range->slba);
2596 nlb = le32_to_cpu(range->nlb);
2597
2598 trace_pci_nvme_dsm_deallocate(slba, nlb);
2599
2600 if (nlb > n->dmrsl) {
2601 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2602 goto next;
2603 }
2604
2605 if (nvme_check_bounds(ns, slba, nlb)) {
2606 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2607 ns->id_ns.nsze);
2608 goto next;
2609 }
2610
2611 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2612 nvme_l2b(ns, nlb),
2613 nvme_dsm_md_cb, iocb);
2614 return;
2615
2616 done:
2617 iocb->aiocb = NULL;
2618 iocb->common.cb(iocb->common.opaque, iocb->ret);
2619 g_free(iocb->range);
2620 qemu_aio_unref(iocb);
2621 }
2622
2623 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2624 {
2625 NvmeNamespace *ns = req->ns;
2626 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2627 uint32_t attr = le32_to_cpu(dsm->attributes);
2628 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2629 uint16_t status = NVME_SUCCESS;
2630
2631 trace_pci_nvme_dsm(nr, attr);
2632
2633 if (attr & NVME_DSMGMT_AD) {
2634 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2635 nvme_misc_cb, req);
2636
2637 iocb->req = req;
2638 iocb->ret = 0;
2639 iocb->range = g_new(NvmeDsmRange, nr);
2640 iocb->nr = nr;
2641 iocb->idx = 0;
2642
2643 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2644 req);
2645 if (status) {
2646 g_free(iocb->range);
2647 qemu_aio_unref(iocb);
2648
2649 return status;
2650 }
2651
2652 req->aiocb = &iocb->common;
2653 nvme_dsm_cb(iocb, 0);
2654
2655 return NVME_NO_COMPLETE;
2656 }
2657
2658 return status;
2659 }
2660
2661 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2662 {
2663 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2664 NvmeNamespace *ns = req->ns;
2665 BlockBackend *blk = ns->blkconf.blk;
2666 uint64_t slba = le64_to_cpu(rw->slba);
2667 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2668 size_t len = nvme_l2b(ns, nlb);
2669 size_t data_len = len;
2670 int64_t offset = nvme_l2b(ns, slba);
2671 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2672 uint32_t reftag = le32_to_cpu(rw->reftag);
2673 NvmeBounceContext *ctx = NULL;
2674 uint16_t status;
2675
2676 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2677
2678 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2679 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2680 if (status) {
2681 return status;
2682 }
2683
2684 if (prinfo & NVME_PRINFO_PRACT) {
2685 return NVME_INVALID_PROT_INFO | NVME_DNR;
2686 }
2687 }
2688
2689 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
2690 data_len += nvme_m2b(ns, nlb);
2691 }
2692
2693 if (data_len > (n->page_size << n->params.vsl)) {
2694 return NVME_INVALID_FIELD | NVME_DNR;
2695 }
2696
2697 status = nvme_check_bounds(ns, slba, nlb);
2698 if (status) {
2699 return status;
2700 }
2701
2702 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2703 status = nvme_check_dulbe(ns, slba, nlb);
2704 if (status) {
2705 return status;
2706 }
2707 }
2708
2709 ctx = g_new0(NvmeBounceContext, 1);
2710 ctx->req = req;
2711
2712 ctx->data.bounce = g_malloc(len);
2713
2714 qemu_iovec_init(&ctx->data.iov, 1);
2715 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2716
2717 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2718 BLOCK_ACCT_READ);
2719
2720 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2721 nvme_verify_mdata_in_cb, ctx);
2722 return NVME_NO_COMPLETE;
2723 }
2724
2725 typedef struct NvmeCopyAIOCB {
2726 BlockAIOCB common;
2727 BlockAIOCB *aiocb;
2728 NvmeRequest *req;
2729 NvmeCtrl *n;
2730 int ret;
2731
2732 void *ranges;
2733 unsigned int format;
2734 int nr;
2735 int idx;
2736
2737 uint8_t *bounce;
2738 QEMUIOVector iov;
2739 struct {
2740 BlockAcctCookie read;
2741 BlockAcctCookie write;
2742 } acct;
2743
2744 uint64_t reftag;
2745 uint64_t slba;
2746
2747 NvmeZone *zone;
2748 NvmeNamespace *sns;
2749 uint32_t tcl;
2750 } NvmeCopyAIOCB;
2751
2752 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2753 {
2754 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2755
2756 iocb->ret = -ECANCELED;
2757
2758 if (iocb->aiocb) {
2759 blk_aio_cancel_async(iocb->aiocb);
2760 iocb->aiocb = NULL;
2761 }
2762 }
2763
2764 static const AIOCBInfo nvme_copy_aiocb_info = {
2765 .aiocb_size = sizeof(NvmeCopyAIOCB),
2766 .cancel_async = nvme_copy_cancel,
2767 };
2768
2769 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2770 {
2771 NvmeRequest *req = iocb->req;
2772 NvmeNamespace *ns = req->ns;
2773 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2774
2775 if (iocb->idx != iocb->nr) {
2776 req->cqe.result = cpu_to_le32(iocb->idx);
2777 }
2778
2779 qemu_iovec_destroy(&iocb->iov);
2780 g_free(iocb->bounce);
2781
2782 if (iocb->ret < 0) {
2783 block_acct_failed(stats, &iocb->acct.read);
2784 block_acct_failed(stats, &iocb->acct.write);
2785 } else {
2786 block_acct_done(stats, &iocb->acct.read);
2787 block_acct_done(stats, &iocb->acct.write);
2788 }
2789
2790 iocb->common.cb(iocb->common.opaque, iocb->ret);
2791 qemu_aio_unref(iocb);
2792 }
2793
2794 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2795
2796 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2797 int idx, uint64_t *slba,
2798 uint32_t *nlb,
2799 uint32_t *snsid,
2800 uint16_t *apptag,
2801 uint16_t *appmask,
2802 uint64_t *reftag)
2803 {
2804 NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2805
2806 if (snsid) {
2807 *snsid = le32_to_cpu(_ranges[idx].sparams);
2808 }
2809
2810 if (slba) {
2811 *slba = le64_to_cpu(_ranges[idx].slba);
2812 }
2813
2814 if (nlb) {
2815 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2816 }
2817
2818 if (apptag) {
2819 *apptag = le16_to_cpu(_ranges[idx].apptag);
2820 }
2821
2822 if (appmask) {
2823 *appmask = le16_to_cpu(_ranges[idx].appmask);
2824 }
2825
2826 if (reftag) {
2827 *reftag = le32_to_cpu(_ranges[idx].reftag);
2828 }
2829 }
2830
2831 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2832 uint64_t *slba,
2833 uint32_t *nlb,
2834 uint32_t *snsid,
2835 uint16_t *apptag,
2836 uint16_t *appmask,
2837 uint64_t *reftag)
2838 {
2839 NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2840
2841 if (snsid) {
2842 *snsid = le32_to_cpu(_ranges[idx].sparams);
2843 }
2844
2845 if (slba) {
2846 *slba = le64_to_cpu(_ranges[idx].slba);
2847 }
2848
2849 if (nlb) {
2850 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2851 }
2852
2853 if (apptag) {
2854 *apptag = le16_to_cpu(_ranges[idx].apptag);
2855 }
2856
2857 if (appmask) {
2858 *appmask = le16_to_cpu(_ranges[idx].appmask);
2859 }
2860
2861 if (reftag) {
2862 *reftag = 0;
2863
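/* assemble the 48-bit reference tag from bytes sr[4..9] (big endian) */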
2864 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2865 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2866 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2867 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2868 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2869 *reftag |= (uint64_t)_ranges[idx].sr[9];
2870 }
2871 }
2872
2873 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2874 uint64_t *slba, uint32_t *nlb,
2875 uint32_t *snsid, uint16_t *apptag,
2876 uint16_t *appmask, uint64_t *reftag)
2877 {
2878 switch (format) {
2879 case NVME_COPY_FORMAT_0:
2880 case NVME_COPY_FORMAT_2:
2881 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2882 apptag, appmask, reftag);
2883 break;
2884
2885 case NVME_COPY_FORMAT_1:
2886 case NVME_COPY_FORMAT_3:
2887 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2888 apptag, appmask, reftag);
2889 break;
2890
2891 default:
2892 abort();
2893 }
2894 }
2895
2896 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2897 NvmeCopyAIOCB *iocb, uint16_t nr)
2898 {
2899 uint32_t copy_len = 0;
2900
2901 for (int idx = 0; idx < nr; idx++) {
2902 uint32_t nlb;
2903 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2904 &nlb, NULL, NULL, NULL, NULL);
2905 copy_len += nlb;
2906 }
2907 iocb->tcl = copy_len;
2908 if (copy_len > ns->id_ns.mcl) {
2909 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2910 }
2911
2912 return NVME_SUCCESS;
2913 }
2914
2915 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2916 {
2917 NvmeCopyAIOCB *iocb = opaque;
2918 NvmeRequest *req = iocb->req;
2919 NvmeNamespace *dns = req->ns;
2920 uint32_t nlb;
2921
2922 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2923 &nlb, NULL, NULL, NULL, NULL);
2924
2925 if (ret < 0) {
2926 iocb->ret = ret;
2927 req->status = NVME_WRITE_FAULT;
2928 goto out;
2929 } else if (iocb->ret < 0) {
2930 goto out;
2931 }
2932
2933 if (dns->params.zoned) {
2934 nvme_advance_zone_wp(dns, iocb->zone, nlb);
2935 }
2936
2937 iocb->idx++;
2938 iocb->slba += nlb;
2939 out:
2940 nvme_do_copy(iocb);
2941 }
2942
2943 static void nvme_copy_out_cb(void *opaque, int ret)
2944 {
2945 NvmeCopyAIOCB *iocb = opaque;
2946 NvmeRequest *req = iocb->req;
2947 NvmeNamespace *dns = req->ns;
2948 uint32_t nlb;
2949 size_t mlen;
2950 uint8_t *mbounce;
2951
2952 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2953 goto out;
2954 }
2955
2956 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2957 &nlb, NULL, NULL, NULL, NULL);
2958
2959 mlen = nvme_m2b(dns, nlb);
2960 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2961
2962 qemu_iovec_reset(&iocb->iov);
2963 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2964
2965 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2966 &iocb->iov, 0, nvme_copy_out_completed_cb,
2967 iocb);
2968
2969 return;
2970
2971 out:
2972 nvme_copy_out_completed_cb(iocb, ret);
2973 }
2974
2975 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2976 {
2977 NvmeCopyAIOCB *iocb = opaque;
2978 NvmeRequest *req = iocb->req;
2979 NvmeNamespace *sns = iocb->sns;
2980 NvmeNamespace *dns = req->ns;
2981 NvmeCopyCmd *copy = NULL;
2982 uint8_t *mbounce = NULL;
2983 uint32_t nlb;
2984 uint64_t slba;
2985 uint16_t apptag, appmask;
2986 uint64_t reftag;
2987 size_t len, mlen;
2988 uint16_t status;
2989
2990 if (ret < 0) {
2991 iocb->ret = ret;
2992 req->status = NVME_UNRECOVERED_READ;
2993 goto out;
2994 } else if (iocb->ret < 0) {
2995 goto out;
2996 }
2997
2998 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2999 &nlb, NULL, &apptag, &appmask, &reftag);
3000
3001 trace_pci_nvme_copy_out(iocb->slba, nlb);
3002
3003 len = nvme_l2b(sns, nlb);
3004
3005 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
3006 copy = (NvmeCopyCmd *)&req->cmd;
3007
3008 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3009
3010 mlen = nvme_m2b(sns, nlb);
3011 mbounce = iocb->bounce + nvme_l2b(sns, nlb);
3012
3013 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
3014 if (status) {
3015 goto invalid;
3016 }
3017 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
3018 slba, apptag, appmask, &reftag);
3019 if (status) {
3020 goto invalid;
3021 }
3022 }
3023
3024 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3025 copy = (NvmeCopyCmd *)&req->cmd;
3026 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3027
3028 mlen = nvme_m2b(dns, nlb);
3029 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3030
3031 apptag = le16_to_cpu(copy->apptag);
3032 appmask = le16_to_cpu(copy->appmask);
3033
3034 if (prinfow & NVME_PRINFO_PRACT) {
3035 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3036 if (status) {
3037 goto invalid;
3038 }
3039
3040 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3041 apptag, &iocb->reftag);
3042 } else {
3043 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3044 prinfow, iocb->slba, apptag, appmask,
3045 &iocb->reftag);
3046 if (status) {
3047 goto invalid;
3048 }
3049 }
3050 }
3051
3052 status = nvme_check_bounds(dns, iocb->slba, nlb);
3053 if (status) {
3054 goto invalid;
3055 }
3056
3057 if (dns->params.zoned) {
3058 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3059 if (status) {
3060 goto invalid;
3061 }
3062
3063 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3064 iocb->zone->w_ptr += nlb;
3065 }
3066 }
3067
3068 qemu_iovec_reset(&iocb->iov);
3069 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3070
3071 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3072 BLOCK_ACCT_WRITE);
3073
3074 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3075 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3076
3077 return;
3078
3079 invalid:
3080 req->status = status;
3081 iocb->ret = -1;
3082 out:
3083 nvme_do_copy(iocb);
3084 }
3085
3086 static void nvme_copy_in_cb(void *opaque, int ret)
3087 {
3088 NvmeCopyAIOCB *iocb = opaque;
3089 NvmeNamespace *sns = iocb->sns;
3090 uint64_t slba;
3091 uint32_t nlb;
3092
3093 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3094 goto out;
3095 }
3096
3097 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3098 &nlb, NULL, NULL, NULL, NULL);
3099
3100 qemu_iovec_reset(&iocb->iov);
3101 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3102 nvme_m2b(sns, nlb));
3103
3104 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3105 &iocb->iov, 0, nvme_copy_in_completed_cb,
3106 iocb);
3107 return;
3108
3109 out:
3110 nvme_copy_in_completed_cb(iocb, ret);
3111 }
3112
3113 static inline bool nvme_csi_supports_copy(uint8_t csi)
3114 {
3115 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3116 }
3117
3118 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3119 NvmeNamespace *dns)
3120 {
3121 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3122 }
3123
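/*
 * Cross-namespace copies require compatible LBA formats; when protection
 * information is involved, the source and destination must either share the
 * same PI settings or satisfy the "corresponding PI format" rules checked by
 * the helpers below.
 */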
3124 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3125 bool pi_enable)
3126 {
3127 if (!nvme_csi_supports_copy(sns->csi) ||
3128 !nvme_csi_supports_copy(dns->csi)) {
3129 return false;
3130 }
3131
3132 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3133 return false;
3134 }
3135
3136 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3137 sns->id_ns.dps != dns->id_ns.dps)) {
3138 return false;
3139 }
3140
3141 return true;
3142 }
3143
3144 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3145 NvmeNamespace *dns)
3146 {
3147 return sns->lbaf.ms == 0 &&
3148 ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3149 (dns->lbaf.ms == 16 && dns->pif == 1));
3150 }
3151
3152 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3153 bool sns_pi_en)
3154 {
3155 if (!nvme_csi_supports_copy(sns->csi) ||
3156 !nvme_csi_supports_copy(dns->csi)) {
3157 return false;
3158 }
3159
3160 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3161 return false;
3162 }
3163
3164 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3165 return false;
3166 }
3167
3168 return true;
3169 }
3170
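/*
 * Main loop of the Copy command. Each iteration handles one source range:
 * validate the (possibly cross-namespace) source, read it into the bounce
 * buffer, run any protection information checks and write it out at the
 * destination LBA, advancing iocb->slba and iocb->idx as ranges complete.
 */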
3171 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3172 {
3173 NvmeRequest *req = iocb->req;
3174 NvmeNamespace *sns;
3175 NvmeNamespace *dns = req->ns;
3176 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3177 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3178 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3179 uint64_t slba;
3180 uint32_t nlb;
3181 size_t len;
3182 uint16_t status;
3183 uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3184 uint32_t snsid = dnsid;
3185
3186 if (iocb->ret < 0) {
3187 goto done;
3188 }
3189
3190 if (iocb->idx == iocb->nr) {
3191 goto done;
3192 }
3193
3194 if (iocb->format == 2 || iocb->format == 3) {
3195 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3196 &slba, &nlb, &snsid, NULL, NULL, NULL);
3197 if (snsid != dnsid) {
3198 if (snsid == NVME_NSID_BROADCAST ||
3199 !nvme_nsid_valid(iocb->n, snsid)) {
3200 status = NVME_INVALID_NSID | NVME_DNR;
3201 goto invalid;
3202 }
3203 iocb->sns = nvme_ns(iocb->n, snsid);
3204 if (unlikely(!iocb->sns)) {
3205 status = NVME_INVALID_FIELD | NVME_DNR;
3206 goto invalid;
3207 }
3208 } else {
3209 if (((slba + nlb) > iocb->slba) &&
3210 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3211 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3212 goto invalid;
3213 }
3214 }
3215 } else {
3216 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3217 &slba, &nlb, NULL, NULL, NULL, NULL);
3218 }
3219
3220 sns = iocb->sns;
3221 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3222 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3223 status = NVME_INVALID_FIELD | NVME_DNR;
3224 goto invalid;
3225 } else if (snsid != dnsid) {
3226 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3227 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3228 if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3229 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3230 goto invalid;
3231 }
3232 }
3233 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3234 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3235 if ((prinfor & NVME_PRINFO_PRACT) !=
3236 (prinfow & NVME_PRINFO_PRACT)) {
3237 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3238 goto invalid;
3239 } else {
3240 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3241 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3242 goto invalid;
3243 }
3244 }
3245 }
3246
3247 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3248 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3249 if (!(prinfow & NVME_PRINFO_PRACT)) {
3250 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3251 goto invalid;
3252 } else {
3253 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3254 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3255 goto invalid;
3256 }
3257 }
3258 }
3259
3260 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3261 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3262 if (!(prinfor & NVME_PRINFO_PRACT)) {
3263 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3264 goto invalid;
3265 } else {
3266 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3267 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3268 goto invalid;
3269 }
3270 }
3271 }
3272 }
3273 len = nvme_l2b(sns, nlb);
3274
3275 trace_pci_nvme_copy_source_range(slba, nlb);
3276
3277 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3278 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3279 goto invalid;
3280 }
3281
3282 status = nvme_check_bounds(sns, slba, nlb);
3283 if (status) {
3284 goto invalid;
3285 }
3286
3287 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3288 status = nvme_check_dulbe(sns, slba, nlb);
3289 if (status) {
3290 goto invalid;
3291 }
3292 }
3293
3294 if (sns->params.zoned) {
3295 status = nvme_check_zone_read(sns, slba, nlb);
3296 if (status) {
3297 goto invalid;
3298 }
3299 }
3300
3301 g_free(iocb->bounce);
3302 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3303 sns->lbasz + sns->lbaf.ms);
3304
3305 qemu_iovec_reset(&iocb->iov);
3306 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3307
3308 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3309 BLOCK_ACCT_READ);
3310
3311 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3312 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3313 return;
3314
3315 invalid:
3316 req->status = status;
3317 iocb->ret = -1;
3318 done:
3319 nvme_copy_done(iocb);
3320 }
3321
3322 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3323 {
3324 NvmeNamespace *ns = req->ns;
3325 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3326 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3327 nvme_misc_cb, req);
3328 uint16_t nr = copy->nr + 1;
3329 uint8_t format = copy->control[0] & 0xf;
3330 size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3331
3332 uint16_t status;
3333
3334 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3335
3336 iocb->ranges = NULL;
3337 iocb->zone = NULL;
3338
3339 if (!(n->id_ctrl.ocfs & (1 << format)) ||
3340 ((format == 2 || format == 3) &&
3341 !(n->features.hbs.cdfe & (1 << format)))) {
3342 trace_pci_nvme_err_copy_invalid_format(format);
3343 status = NVME_INVALID_FIELD | NVME_DNR;
3344 goto invalid;
3345 }
3346
3347 if (nr > ns->id_ns.msrc + 1) {
3348 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3349 goto invalid;
3350 }
3351
3352 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3353 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3354 status = NVME_INVALID_FORMAT | NVME_DNR;
3355 goto invalid;
3356 }
3357
3358 if (ns->pif) {
3359 len = sizeof(NvmeCopySourceRangeFormat1_3);
3360 }
3361
3362 iocb->format = format;
3363 iocb->ranges = g_malloc_n(nr, len);
3364 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3365 if (status) {
3366 goto invalid;
3367 }
3368
3369 iocb->slba = le64_to_cpu(copy->sdlba);
3370
3371 if (ns->params.zoned) {
3372 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3373 if (!iocb->zone) {
3374 status = NVME_LBA_RANGE | NVME_DNR;
3375 goto invalid;
3376 }
3377
3378 status = nvme_zrm_auto(n, ns, iocb->zone);
3379 if (status) {
3380 goto invalid;
3381 }
3382 }
3383
3384 status = nvme_check_copy_mcl(ns, iocb, nr);
3385 if (status) {
3386 goto invalid;
3387 }
3388
3389 iocb->req = req;
3390 iocb->ret = 0;
3391 iocb->nr = nr;
3392 iocb->idx = 0;
3393 iocb->reftag = le32_to_cpu(copy->reftag);
3394 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3395
3396 qemu_iovec_init(&iocb->iov, 1);
3397
3398 req->aiocb = &iocb->common;
3399 iocb->sns = req->ns;
3400 iocb->n = n;
3401 iocb->bounce = NULL;
3402 nvme_do_copy(iocb);
3403
3404 return NVME_NO_COMPLETE;
3405
3406 invalid:
3407 g_free(iocb->ranges);
3408 qemu_aio_unref(iocb);
3409 return status;
3410 }
3411
3412 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3413 {
3414 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3415 NvmeNamespace *ns = req->ns;
3416 BlockBackend *blk = ns->blkconf.blk;
3417 uint64_t slba = le64_to_cpu(rw->slba);
3418 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3419 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3420 size_t data_len = nvme_l2b(ns, nlb);
3421 size_t len = data_len;
3422 int64_t offset = nvme_l2b(ns, slba);
3423 struct nvme_compare_ctx *ctx = NULL;
3424 uint16_t status;
3425
3426 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3427
3428 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3429 return NVME_INVALID_PROT_INFO | NVME_DNR;
3430 }
3431
3432 if (nvme_ns_ext(ns)) {
3433 len += nvme_m2b(ns, nlb);
3434 }
3435
3436 if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) {
3437 status = nvme_check_mdts(n, data_len);
3438 } else {
3439 status = nvme_check_mdts(n, len);
3440 }
3441 if (status) {
3442 return status;
3443 }
3444
3445 status = nvme_check_bounds(ns, slba, nlb);
3446 if (status) {
3447 return status;
3448 }
3449
3450 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3451 status = nvme_check_dulbe(ns, slba, nlb);
3452 if (status) {
3453 return status;
3454 }
3455 }
3456
3457 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3458 if (status) {
3459 return status;
3460 }
3461
3462 ctx = g_new(struct nvme_compare_ctx, 1);
3463 ctx->data.bounce = g_malloc(data_len);
3464
3465 req->opaque = ctx;
3466
3467 qemu_iovec_init(&ctx->data.iov, 1);
3468 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3469
3470 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3471 BLOCK_ACCT_READ);
3472 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3473 nvme_compare_data_cb, req);
3474
3475 return NVME_NO_COMPLETE;
3476 }
3477
3478 typedef struct NvmeFlushAIOCB {
3479 BlockAIOCB common;
3480 BlockAIOCB *aiocb;
3481 NvmeRequest *req;
3482 int ret;
3483
3484 NvmeNamespace *ns;
3485 uint32_t nsid;
3486 bool broadcast;
3487 } NvmeFlushAIOCB;
3488
3489 static void nvme_flush_cancel(BlockAIOCB *acb)
3490 {
3491 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3492
3493 iocb->ret = -ECANCELED;
3494
3495 if (iocb->aiocb) {
3496 blk_aio_cancel_async(iocb->aiocb);
3497 iocb->aiocb = NULL;
3498 }
3499 }
3500
3501 static const AIOCBInfo nvme_flush_aiocb_info = {
3502 .aiocb_size = sizeof(NvmeFlushAIOCB),
3503 .cancel_async = nvme_flush_cancel,
3504 };
3505
3506 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3507
3508 static void nvme_flush_ns_cb(void *opaque, int ret)
3509 {
3510 NvmeFlushAIOCB *iocb = opaque;
3511 NvmeNamespace *ns = iocb->ns;
3512
3513 if (ret < 0) {
3514 iocb->ret = ret;
3515 iocb->req->status = NVME_WRITE_FAULT;
3516 goto out;
3517 } else if (iocb->ret < 0) {
3518 goto out;
3519 }
3520
3521 if (ns) {
3522 trace_pci_nvme_flush_ns(iocb->nsid);
3523
3524 iocb->ns = NULL;
3525 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3526 return;
3527 }
3528
3529 out:
3530 nvme_do_flush(iocb);
3531 }
3532
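/*
 * For a broadcast flush (NSID FFFFFFFFh) this walks all attached namespaces
 * and flushes them one by one; otherwise only the single namespace set up by
 * nvme_flush() is flushed.
 */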
3533 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3534 {
3535 NvmeRequest *req = iocb->req;
3536 NvmeCtrl *n = nvme_ctrl(req);
3537 int i;
3538
3539 if (iocb->ret < 0) {
3540 goto done;
3541 }
3542
3543 if (iocb->broadcast) {
3544 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3545 iocb->ns = nvme_ns(n, i);
3546 if (iocb->ns) {
3547 iocb->nsid = i;
3548 break;
3549 }
3550 }
3551 }
3552
3553 if (!iocb->ns) {
3554 goto done;
3555 }
3556
3557 nvme_flush_ns_cb(iocb, 0);
3558 return;
3559
3560 done:
3561 iocb->common.cb(iocb->common.opaque, iocb->ret);
3562 qemu_aio_unref(iocb);
3563 }
3564
3565 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3566 {
3567 NvmeFlushAIOCB *iocb;
3568 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3569 uint16_t status;
3570
3571 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3572
3573 iocb->req = req;
3574 iocb->ret = 0;
3575 iocb->ns = NULL;
3576 iocb->nsid = 0;
3577 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3578
3579 if (!iocb->broadcast) {
3580 if (!nvme_nsid_valid(n, nsid)) {
3581 status = NVME_INVALID_NSID | NVME_DNR;
3582 goto out;
3583 }
3584
3585 iocb->ns = nvme_ns(n, nsid);
3586 if (!iocb->ns) {
3587 status = NVME_INVALID_FIELD | NVME_DNR;
3588 goto out;
3589 }
3590
3591 iocb->nsid = nsid;
3592 }
3593
3594 req->aiocb = &iocb->common;
3595 nvme_do_flush(iocb);
3596
3597 return NVME_NO_COMPLETE;
3598
3599 out:
3600 qemu_aio_unref(iocb);
3601
3602 return status;
3603 }
3604
3605 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3606 {
3607 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3608 NvmeNamespace *ns = req->ns;
3609 uint64_t slba = le64_to_cpu(rw->slba);
3610 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3611 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3612 uint64_t data_size = nvme_l2b(ns, nlb);
3613 uint64_t mapped_size = data_size;
3614 uint64_t data_offset;
3615 BlockBackend *blk = ns->blkconf.blk;
3616 uint16_t status;
3617
3618 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3619 mapped_size += nvme_m2b(ns, nlb);
3620
3621 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3622 bool pract = prinfo & NVME_PRINFO_PRACT;
3623
3624 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3625 mapped_size = data_size;
3626 }
3627 }
3628 }
3629
3630 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3631
3632 status = nvme_check_mdts(n, mapped_size);
3633 if (status) {
3634 goto invalid;
3635 }
3636
3637 status = nvme_check_bounds(ns, slba, nlb);
3638 if (status) {
3639 goto invalid;
3640 }
3641
3642 if (ns->params.zoned) {
3643 status = nvme_check_zone_read(ns, slba, nlb);
3644 if (status) {
3645 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3646 goto invalid;
3647 }
3648 }
3649
3650 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3651 status = nvme_check_dulbe(ns, slba, nlb);
3652 if (status) {
3653 goto invalid;
3654 }
3655 }
3656
3657 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3658 return nvme_dif_rw(n, req);
3659 }
3660
3661 status = nvme_map_data(n, nlb, req);
3662 if (status) {
3663 goto invalid;
3664 }
3665
3666 data_offset = nvme_l2b(ns, slba);
3667
3668 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3669 BLOCK_ACCT_READ);
3670 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3671 return NVME_NO_COMPLETE;
3672
3673 invalid:
3674 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3675 return status | NVME_DNR;
3676 }
3677
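/*
 * Account a write against Flexible Data Placement: update the host/media
 * bytes-written statistics and consume capacity from the targeted reclaim
 * unit, moving to a new reclaim unit whenever the current one fills up.
 */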
3678 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3679 uint32_t nlb)
3680 {
3681 NvmeNamespace *ns = req->ns;
3682 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3683 uint64_t data_size = nvme_l2b(ns, nlb);
3684 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3685 uint8_t dtype = (dw12 >> 20) & 0xf;
3686 uint16_t pid = le16_to_cpu(rw->dspec);
3687 uint16_t ph, rg, ruhid;
3688 NvmeReclaimUnit *ru;
3689
3690 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3691 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3692 ph = 0;
3693 rg = 0;
3694 }
3695
3696 ruhid = ns->fdp.phs[ph];
3697 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3698
3699 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3700 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3701
3702 while (nlb) {
3703 if (nlb < ru->ruamw) {
3704 ru->ruamw -= nlb;
3705 break;
3706 }
3707
3708 nlb -= ru->ruamw;
3709 nvme_update_ruh(n, ns, pid);
3710 }
3711 }
3712
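/*
 * Common implementation for Write, Write Zeroes and Zone Append. For zoned
 * namespaces this resolves the target zone, validates the write against the
 * write pointer and transitions the zone as needed; Zone Append additionally
 * rewrites the SLBA to the current write pointer.
 */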
3713 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3714 bool wrz)
3715 {
3716 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3717 NvmeNamespace *ns = req->ns;
3718 uint64_t slba = le64_to_cpu(rw->slba);
3719 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3720 uint16_t ctrl = le16_to_cpu(rw->control);
3721 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3722 uint64_t data_size = nvme_l2b(ns, nlb);
3723 uint64_t mapped_size = data_size;
3724 uint64_t data_offset;
3725 NvmeZone *zone;
3726 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3727 BlockBackend *blk = ns->blkconf.blk;
3728 uint16_t status;
3729
3730 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3731 mapped_size += nvme_m2b(ns, nlb);
3732
3733 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3734 bool pract = prinfo & NVME_PRINFO_PRACT;
3735
3736 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3737 mapped_size -= nvme_m2b(ns, nlb);
3738 }
3739 }
3740 }
3741
3742 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3743 nvme_nsid(ns), nlb, mapped_size, slba);
3744
3745 if (!wrz) {
3746 status = nvme_check_mdts(n, mapped_size);
3747 if (status) {
3748 goto invalid;
3749 }
3750 }
3751
3752 status = nvme_check_bounds(ns, slba, nlb);
3753 if (status) {
3754 goto invalid;
3755 }
3756
3757 if (ns->params.zoned) {
3758 zone = nvme_get_zone_by_slba(ns, slba);
3759 assert(zone);
3760
3761 if (append) {
3762 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3763
3764 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3765 return NVME_INVALID_ZONE_OP | NVME_DNR;
3766 }
3767
3768 if (unlikely(slba != zone->d.zslba)) {
3769 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3770 status = NVME_INVALID_FIELD;
3771 goto invalid;
3772 }
3773
3774 if (n->params.zasl &&
3775 data_size > (uint64_t)n->page_size << n->params.zasl) {
3776 trace_pci_nvme_err_zasl(data_size);
3777 return NVME_INVALID_FIELD | NVME_DNR;
3778 }
3779
3780 slba = zone->w_ptr;
3781 rw->slba = cpu_to_le64(slba);
3782 res->slba = cpu_to_le64(slba);
3783
3784 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3785 case NVME_ID_NS_DPS_TYPE_1:
3786 if (!piremap) {
3787 return NVME_INVALID_PROT_INFO | NVME_DNR;
3788 }
3789
3790 /* fallthrough */
3791
3792 case NVME_ID_NS_DPS_TYPE_2:
3793 if (piremap) {
3794 uint32_t reftag = le32_to_cpu(rw->reftag);
3795 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3796 }
3797
3798 break;
3799
3800 case NVME_ID_NS_DPS_TYPE_3:
3801 if (piremap) {
3802 return NVME_INVALID_PROT_INFO | NVME_DNR;
3803 }
3804
3805 break;
3806 }
3807 }
3808
3809 status = nvme_check_zone_write(ns, zone, slba, nlb);
3810 if (status) {
3811 goto invalid;
3812 }
3813
3814 status = nvme_zrm_auto(n, ns, zone);
3815 if (status) {
3816 goto invalid;
3817 }
3818
3819 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3820 zone->w_ptr += nlb;
3821 }
3822 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3823 nvme_do_write_fdp(n, req, slba, nlb);
3824 }
3825
3826 data_offset = nvme_l2b(ns, slba);
3827
3828 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3829 return nvme_dif_rw(n, req);
3830 }
3831
3832 if (!wrz) {
3833 status = nvme_map_data(n, nlb, req);
3834 if (status) {
3835 goto invalid;
3836 }
3837
3838 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3839 BLOCK_ACCT_WRITE);
3840 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3841 } else {
3842 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3843 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3844 req);
3845 }
3846
3847 return NVME_NO_COMPLETE;
3848
3849 invalid:
3850 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3851 return status | NVME_DNR;
3852 }
3853
3854 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3855 {
3856 return nvme_do_write(n, req, false, false);
3857 }
3858
3859 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3860 {
3861 return nvme_do_write(n, req, false, true);
3862 }
3863
3864 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3865 {
3866 return nvme_do_write(n, req, true, false);
3867 }
3868
3869 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3870 uint64_t *slba, uint32_t *zone_idx)
3871 {
3872 uint32_t dw10 = le32_to_cpu(c->cdw10);
3873 uint32_t dw11 = le32_to_cpu(c->cdw11);
3874
3875 if (!ns->params.zoned) {
3876 trace_pci_nvme_err_invalid_opc(c->opcode);
3877 return NVME_INVALID_OPCODE | NVME_DNR;
3878 }
3879
3880 *slba = ((uint64_t)dw11) << 32 | dw10;
3881 if (unlikely(*slba >= ns->id_ns.nsze)) {
3882 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3883 *slba = 0;
3884 return NVME_LBA_RANGE | NVME_DNR;
3885 }
3886
3887 *zone_idx = nvme_zone_idx(ns, *slba);
3888 assert(*zone_idx < ns->num_zones);
3889
3890 return NVME_SUCCESS;
3891 }
3892
3893 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3894 NvmeRequest *);
3895
3896 enum NvmeZoneProcessingMask {
3897 NVME_PROC_CURRENT_ZONE = 0,
3898 NVME_PROC_OPENED_ZONES = 1 << 0,
3899 NVME_PROC_CLOSED_ZONES = 1 << 1,
3900 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3901 NVME_PROC_FULL_ZONES = 1 << 3,
3902 };
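/*
 * Zone Management Send helpers: the processing mask selects which zone
 * states a bulk ("Select All") operation applies to; with an empty mask only
 * the zone addressed by the command is processed.
 */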
3903
3904 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3905 NvmeZoneState state, NvmeRequest *req)
3906 {
3907 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3908 int flags = 0;
3909
3910 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3911 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3912
3913 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3914 return NVME_INVALID_ZONE_OP | NVME_DNR;
3915 }
3916
3917 if (zone->w_ptr % ns->zns.zrwafg) {
3918 return NVME_NOZRWA | NVME_DNR;
3919 }
3920
3921 flags = NVME_ZRM_ZRWA;
3922 }
3923
3924 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3925 }
3926
3927 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3928 NvmeZoneState state, NvmeRequest *req)
3929 {
3930 return nvme_zrm_close(ns, zone);
3931 }
3932
3933 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3934 NvmeZoneState state, NvmeRequest *req)
3935 {
3936 return nvme_zrm_finish(ns, zone);
3937 }
3938
3939 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3940 NvmeZoneState state, NvmeRequest *req)
3941 {
3942 switch (state) {
3943 case NVME_ZONE_STATE_READ_ONLY:
3944 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3945 /* fall through */
3946 case NVME_ZONE_STATE_OFFLINE:
3947 return NVME_SUCCESS;
3948 default:
3949 return NVME_ZONE_INVAL_TRANSITION;
3950 }
3951 }
3952
3953 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3954 {
3955 uint16_t status;
3956 uint8_t state = nvme_get_zone_state(zone);
3957
3958 if (state == NVME_ZONE_STATE_EMPTY) {
3959 status = nvme_aor_check(ns, 1, 0);
3960 if (status) {
3961 return status;
3962 }
3963 nvme_aor_inc_active(ns);
3964 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3965 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3966 return NVME_SUCCESS;
3967 }
3968
3969 return NVME_ZONE_INVAL_TRANSITION;
3970 }
3971
3972 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3973 enum NvmeZoneProcessingMask proc_mask,
3974 op_handler_t op_hndlr, NvmeRequest *req)
3975 {
3976 uint16_t status = NVME_SUCCESS;
3977 NvmeZoneState zs = nvme_get_zone_state(zone);
3978 bool proc_zone;
3979
3980 switch (zs) {
3981 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3982 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3983 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3984 break;
3985 case NVME_ZONE_STATE_CLOSED:
3986 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3987 break;
3988 case NVME_ZONE_STATE_READ_ONLY:
3989 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3990 break;
3991 case NVME_ZONE_STATE_FULL:
3992 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3993 break;
3994 default:
3995 proc_zone = false;
3996 }
3997
3998 if (proc_zone) {
3999 status = op_hndlr(ns, zone, zs, req);
4000 }
4001
4002 return status;
4003 }
4004
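/*
 * Apply op_hndlr either to the single addressed zone (empty mask) or to
 * every zone matching proc_mask. Open, closed and full zones are kept on
 * dedicated lists and are walked directly; read-only zones have no list of
 * their own, so that case falls back to scanning the whole zone array.
 */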
4005 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
4006 enum NvmeZoneProcessingMask proc_mask,
4007 op_handler_t op_hndlr, NvmeRequest *req)
4008 {
4009 NvmeZone *next;
4010 uint16_t status = NVME_SUCCESS;
4011 int i;
4012
4013 if (!proc_mask) {
4014 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
4015 } else {
4016 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
4017 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
4018 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4019 req);
4020 if (status && status != NVME_NO_COMPLETE) {
4021 goto out;
4022 }
4023 }
4024 }
4025 if (proc_mask & NVME_PROC_OPENED_ZONES) {
4026 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4027 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4028 req);
4029 if (status && status != NVME_NO_COMPLETE) {
4030 goto out;
4031 }
4032 }
4033
4034 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4035 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4036 req);
4037 if (status && status != NVME_NO_COMPLETE) {
4038 goto out;
4039 }
4040 }
4041 }
4042 if (proc_mask & NVME_PROC_FULL_ZONES) {
4043 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4044 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4045 req);
4046 if (status && status != NVME_NO_COMPLETE) {
4047 goto out;
4048 }
4049 }
4050 }
4051
4052 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4053 for (i = 0; i < ns->num_zones; i++, zone++) {
4054 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4055 req);
4056 if (status && status != NVME_NO_COMPLETE) {
4057 goto out;
4058 }
4059 }
4060 }
4061 }
4062
4063 out:
4064 return status;
4065 }
4066
4067 typedef struct NvmeZoneResetAIOCB {
4068 BlockAIOCB common;
4069 BlockAIOCB *aiocb;
4070 NvmeRequest *req;
4071 int ret;
4072
4073 bool all;
4074 int idx;
4075 NvmeZone *zone;
4076 } NvmeZoneResetAIOCB;
4077
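/*
 * Zone reset runs as a chain of asynchronous callbacks: nvme_zone_reset_cb
 * finalizes the previously selected zone (nvme_zrm_reset), picks the next
 * zone to process (or stops after a single zone when 'all' is not set) and
 * issues a write-zeroes over the zone data; nvme_zone_reset_epilogue_cb
 * then zeroes the corresponding metadata, if the namespace has any, before
 * re-entering nvme_zone_reset_cb.
 */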
4078 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4079 {
4080 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4081 NvmeRequest *req = iocb->req;
4082 NvmeNamespace *ns = req->ns;
4083
4084 iocb->idx = ns->num_zones;
4085
4086 iocb->ret = -ECANCELED;
4087
4088 if (iocb->aiocb) {
4089 blk_aio_cancel_async(iocb->aiocb);
4090 iocb->aiocb = NULL;
4091 }
4092 }
4093
4094 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4095 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4096 .cancel_async = nvme_zone_reset_cancel,
4097 };
4098
4099 static void nvme_zone_reset_cb(void *opaque, int ret);
4100
4101 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4102 {
4103 NvmeZoneResetAIOCB *iocb = opaque;
4104 NvmeRequest *req = iocb->req;
4105 NvmeNamespace *ns = req->ns;
4106 int64_t moff;
4107 int count;
4108
4109 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4110 goto out;
4111 }
4112
4113 moff = nvme_moff(ns, iocb->zone->d.zslba);
4114 count = nvme_m2b(ns, ns->zone_size);
4115
4116 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4117 BDRV_REQ_MAY_UNMAP,
4118 nvme_zone_reset_cb, iocb);
4119 return;
4120
4121 out:
4122 nvme_zone_reset_cb(iocb, ret);
4123 }
4124
4125 static void nvme_zone_reset_cb(void *opaque, int ret)
4126 {
4127 NvmeZoneResetAIOCB *iocb = opaque;
4128 NvmeRequest *req = iocb->req;
4129 NvmeNamespace *ns = req->ns;
4130
4131 if (iocb->ret < 0) {
4132 goto done;
4133 } else if (ret < 0) {
4134 iocb->ret = ret;
4135 goto done;
4136 }
4137
4138 if (iocb->zone) {
4139 nvme_zrm_reset(ns, iocb->zone);
4140
4141 if (!iocb->all) {
4142 goto done;
4143 }
4144 }
4145
4146 while (iocb->idx < ns->num_zones) {
4147 NvmeZone *zone = &ns->zone_array[iocb->idx++];
4148
4149 switch (nvme_get_zone_state(zone)) {
4150 case NVME_ZONE_STATE_EMPTY:
4151 if (!iocb->all) {
4152 goto done;
4153 }
4154
4155 continue;
4156
4157 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4158 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4159 case NVME_ZONE_STATE_CLOSED:
4160 case NVME_ZONE_STATE_FULL:
4161 iocb->zone = zone;
4162 break;
4163
4164 default:
4165 continue;
4166 }
4167
4168 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4169
4170 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4171 nvme_l2b(ns, zone->d.zslba),
4172 nvme_l2b(ns, ns->zone_size),
4173 BDRV_REQ_MAY_UNMAP,
4174 nvme_zone_reset_epilogue_cb,
4175 iocb);
4176 return;
4177 }
4178
4179 done:
4180 iocb->aiocb = NULL;
4181
4182 iocb->common.cb(iocb->common.opaque, iocb->ret);
4183 qemu_aio_unref(iocb);
4184 }
4185
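/*
 * ZRWA flush: the LBA in the command designates the last LBA to commit from
 * the zone random write area. The checks below require ZRWA support on the
 * namespace, a valid ZRWA on the zone, a commit range that stays within the
 * ZRWA window and a length that is a multiple of the commit granularity
 * (zrwafg); on success the write pointer advances past the committed LBAs.
 */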
4186 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4187 uint64_t elba, NvmeRequest *req)
4188 {
4189 NvmeNamespace *ns = req->ns;
4190 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4191 uint64_t wp = zone->d.wp;
4192 uint32_t nlb = elba - wp + 1;
4193 uint16_t status;
4194
4196 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4197 return NVME_INVALID_ZONE_OP | NVME_DNR;
4198 }
4199
4200 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4201 return NVME_INVALID_FIELD | NVME_DNR;
4202 }
4203
4204 if (elba < wp || elba > wp + ns->zns.zrwas) {
4205 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4206 }
4207
4208 if (nlb % ns->zns.zrwafg) {
4209 return NVME_INVALID_FIELD | NVME_DNR;
4210 }
4211
4212 status = nvme_zrm_auto(n, ns, zone);
4213 if (status) {
4214 return status;
4215 }
4216
4217 zone->w_ptr += nlb;
4218
4219 nvme_advance_zone_wp(ns, zone, nlb);
4220
4221 return NVME_SUCCESS;
4222 }
4223
4224 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4225 {
4226 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4227 NvmeNamespace *ns = req->ns;
4228 NvmeZone *zone;
4229 NvmeZoneResetAIOCB *iocb;
4230 uint8_t *zd_ext;
4231 uint64_t slba = 0;
4232 uint32_t zone_idx = 0;
4233 uint16_t status;
4234 uint8_t action = cmd->zsa;
4235 bool all;
4236 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4237
4238 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4239
4240 req->status = NVME_SUCCESS;
4241
4242 if (!all) {
4243 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4244 if (status) {
4245 return status;
4246 }
4247 }
4248
4249 zone = &ns->zone_array[zone_idx];
4250 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4251 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4252 return NVME_INVALID_FIELD | NVME_DNR;
4253 }
4254
4255 switch (action) {
4256
4257 case NVME_ZONE_ACTION_OPEN:
4258 if (all) {
4259 proc_mask = NVME_PROC_CLOSED_ZONES;
4260 }
4261 trace_pci_nvme_open_zone(slba, zone_idx, all);
4262 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4263 break;
4264
4265 case NVME_ZONE_ACTION_CLOSE:
4266 if (all) {
4267 proc_mask = NVME_PROC_OPENED_ZONES;
4268 }
4269 trace_pci_nvme_close_zone(slba, zone_idx, all);
4270 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4271 break;
4272
4273 case NVME_ZONE_ACTION_FINISH:
4274 if (all) {
4275 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4276 }
4277 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4278 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4279 break;
4280
4281 case NVME_ZONE_ACTION_RESET:
4282 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4283
4284 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4285 nvme_misc_cb, req);
4286
4287 iocb->req = req;
4288 iocb->ret = 0;
4289 iocb->all = all;
4290 iocb->idx = zone_idx;
4291 iocb->zone = NULL;
4292
4293 req->aiocb = &iocb->common;
4294 nvme_zone_reset_cb(iocb, 0);
4295
4296 return NVME_NO_COMPLETE;
4297
4298 case NVME_ZONE_ACTION_OFFLINE:
4299 if (all) {
4300 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4301 }
4302 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4303 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4304 break;
4305
4306 case NVME_ZONE_ACTION_SET_ZD_EXT:
4307 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4308 if (all || !ns->params.zd_extension_size) {
4309 return NVME_INVALID_FIELD | NVME_DNR;
4310 }
4311 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4312 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4313 if (status) {
4314 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4315 return status;
4316 }
4317
4318 status = nvme_set_zd_ext(ns, zone);
4319 if (status == NVME_SUCCESS) {
4320 trace_pci_nvme_zd_extension_set(zone_idx);
4321 return status;
4322 }
4323 break;
4324
4325 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4326 if (all) {
4327 return NVME_INVALID_FIELD | NVME_DNR;
4328 }
4329
4330 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4331
4332 default:
4333 trace_pci_nvme_err_invalid_mgmt_action(action);
4334 status = NVME_INVALID_FIELD;
4335 }
4336
4337 if (status == NVME_ZONE_INVAL_TRANSITION) {
4338 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4339 zone->d.za);
4340 }
4341 if (status) {
4342 status |= NVME_DNR;
4343 }
4344
4345 return status;
4346 }
4347
4348 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4349 {
4350 NvmeZoneState zs = nvme_get_zone_state(zl);
4351
4352 switch (zafs) {
4353 case NVME_ZONE_REPORT_ALL:
4354 return true;
4355 case NVME_ZONE_REPORT_EMPTY:
4356 return zs == NVME_ZONE_STATE_EMPTY;
4357 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4358 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4359 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4360 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4361 case NVME_ZONE_REPORT_CLOSED:
4362 return zs == NVME_ZONE_STATE_CLOSED;
4363 case NVME_ZONE_REPORT_FULL:
4364 return zs == NVME_ZONE_STATE_FULL;
4365 case NVME_ZONE_REPORT_READ_ONLY:
4366 return zs == NVME_ZONE_STATE_READ_ONLY;
4367 case NVME_ZONE_REPORT_OFFLINE:
4368 return zs == NVME_ZONE_STATE_OFFLINE;
4369 default:
4370 return false;
4371 }
4372 }
4373
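/*
 * Zone Management Receive (report zones): the reply consists of a header
 * holding the number of zones matching the requested state filter, followed
 * by one zone descriptor per matching zone starting at the zone containing
 * the given SLBA. Without the partial bit the header counts every matching
 * zone up to the end of the namespace even if only some descriptors fit in
 * the buffer; extended reports append the zone descriptor extension after
 * each descriptor.
 */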
4374 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4375 {
4376 NvmeCmd *cmd = &req->cmd;
4377 NvmeNamespace *ns = req->ns;
4378 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4379 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4380 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4381 uint32_t zone_idx, zra, zrasf, partial;
4382 uint64_t max_zones, nr_zones = 0;
4383 uint16_t status;
4384 uint64_t slba;
4385 NvmeZoneDescr *z;
4386 NvmeZone *zone;
4387 NvmeZoneReportHeader *header;
4388 void *buf, *buf_p;
4389 size_t zone_entry_sz;
4390 int i;
4391
4392 req->status = NVME_SUCCESS;
4393
4394 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4395 if (status) {
4396 return status;
4397 }
4398
4399 zra = dw13 & 0xff;
4400 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4401 return NVME_INVALID_FIELD | NVME_DNR;
4402 }
4403 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4404 return NVME_INVALID_FIELD | NVME_DNR;
4405 }
4406
4407 zrasf = (dw13 >> 8) & 0xff;
4408 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4409 return NVME_INVALID_FIELD | NVME_DNR;
4410 }
4411
4412 if (data_size < sizeof(NvmeZoneReportHeader)) {
4413 return NVME_INVALID_FIELD | NVME_DNR;
4414 }
4415
4416 status = nvme_check_mdts(n, data_size);
4417 if (status) {
4418 return status;
4419 }
4420
4421 partial = (dw13 >> 16) & 0x01;
4422
4423 zone_entry_sz = sizeof(NvmeZoneDescr);
4424 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4425 zone_entry_sz += ns->params.zd_extension_size;
4426 }
4427
4428 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4429 buf = g_malloc0(data_size);
4430
4431 zone = &ns->zone_array[zone_idx];
4432 for (i = zone_idx; i < ns->num_zones; i++) {
4433 if (partial && nr_zones >= max_zones) {
4434 break;
4435 }
4436 if (nvme_zone_matches_filter(zrasf, zone++)) {
4437 nr_zones++;
4438 }
4439 }
4440 header = buf;
4441 header->nr_zones = cpu_to_le64(nr_zones);
4442
4443 buf_p = buf + sizeof(NvmeZoneReportHeader);
4444 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4445 zone = &ns->zone_array[zone_idx];
4446 if (nvme_zone_matches_filter(zrasf, zone)) {
4447 z = buf_p;
4448 buf_p += sizeof(NvmeZoneDescr);
4449
4450 z->zt = zone->d.zt;
4451 z->zs = zone->d.zs;
4452 z->zcap = cpu_to_le64(zone->d.zcap);
4453 z->zslba = cpu_to_le64(zone->d.zslba);
4454 z->za = zone->d.za;
4455
4456 if (nvme_wp_is_valid(zone)) {
4457 z->wp = cpu_to_le64(zone->d.wp);
4458 } else {
4459 z->wp = cpu_to_le64(~0ULL);
4460 }
4461
4462 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4463 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4464 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4465 ns->params.zd_extension_size);
4466 }
4467 buf_p += ns->params.zd_extension_size;
4468 }
4469
4470 max_zones--;
4471 }
4472 }
4473
4474 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4475
4476 g_free(buf);
4477
4478 return status;
4479 }
4480
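/*
 * I/O Management Receive, Reclaim Unit Handle Status: one descriptor is
 * returned per (placement handle, reclaim group) pair of the namespace,
 * reporting the placement identifier and the reclaim unit available media
 * writes (ruamw) of the backing reclaim unit.
 */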
4481 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4482 size_t len)
4483 {
4484 NvmeNamespace *ns = req->ns;
4485 NvmeEnduranceGroup *endgrp;
4486 NvmeRuhStatus *hdr;
4487 NvmeRuhStatusDescr *ruhsd;
4488 unsigned int nruhsd;
4489 uint16_t rg, ph, *ruhid;
4490 size_t trans_len;
4491 g_autofree uint8_t *buf = NULL;
4492
4493 if (!n->subsys) {
4494 return NVME_INVALID_FIELD | NVME_DNR;
4495 }
4496
4497 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4498 return NVME_INVALID_NSID | NVME_DNR;
4499 }
4500
4501 if (!n->subsys->endgrp.fdp.enabled) {
4502 return NVME_FDP_DISABLED | NVME_DNR;
4503 }
4504
4505 endgrp = ns->endgrp;
4506
4507 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4508 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4509 buf = g_malloc0(trans_len);
4510
4511 trans_len = MIN(trans_len, len);
4512
4513 hdr = (NvmeRuhStatus *)buf;
4514 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4515
4516 hdr->nruhsd = cpu_to_le16(nruhsd);
4517
4518 ruhid = ns->fdp.phs;
4519
4520 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4521 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4522
4523 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4524 uint16_t pid = nvme_make_pid(ns, rg, ph);
4525
4526 ruhsd->pid = cpu_to_le16(pid);
4527 ruhsd->ruhid = *ruhid;
4528 ruhsd->earutr = 0;
4529 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4530 }
4531 }
4532
4533 return nvme_c2h(n, buf, trans_len, req);
4534 }
4535
4536 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4537 {
4538 NvmeCmd *cmd = &req->cmd;
4539 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4540 uint32_t numd = le32_to_cpu(cmd->cdw11);
4541 uint8_t mo = (cdw10 & 0xff);
4542 size_t len = (numd + 1) << 2;
4543
4544 switch (mo) {
4545 case NVME_IOMR_MO_NOP:
4546 return 0;
4547 case NVME_IOMR_MO_RUH_STATUS:
4548 return nvme_io_mgmt_recv_ruhs(n, req, len);
4549 default:
4550 return NVME_INVALID_FIELD | NVME_DNR;
4551 };
4552 }
4553
4554 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4555 {
4556 NvmeCmd *cmd = &req->cmd;
4557 NvmeNamespace *ns = req->ns;
4558 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4559 uint16_t ret = NVME_SUCCESS;
4560 uint32_t npid = (cdw10 >> 16) + 1;
4561 unsigned int i = 0;
4562 g_autofree uint16_t *pids = NULL;
4563 uint32_t maxnpid;
4564
4565 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4566 return NVME_FDP_DISABLED | NVME_DNR;
4567 }
4568
4569 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4570
4571 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4572 return NVME_INVALID_FIELD | NVME_DNR;
4573 }
4574
4575 pids = g_new(uint16_t, npid);
4576
4577 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4578 if (ret) {
4579 return ret;
4580 }
4581
4582 for (; i < npid; i++) {
4583 if (!nvme_update_ruh(n, ns, pids[i])) {
4584 return NVME_INVALID_FIELD | NVME_DNR;
4585 }
4586 }
4587
4588 return ret;
4589 }
4590
4591 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4592 {
4593 NvmeCmd *cmd = &req->cmd;
4594 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4595 uint8_t mo = (cdw10 & 0xff);
4596
4597 switch (mo) {
4598 case NVME_IOMS_MO_NOP:
4599 return 0;
4600 case NVME_IOMS_MO_RUH_UPDATE:
4601 return nvme_io_mgmt_send_ruh_update(n, req);
4602 default:
4603 return NVME_INVALID_FIELD | NVME_DNR;
4604 };
4605 }
4606
4607 static uint16_t __nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req)
4608 {
4609 switch (req->cmd.opcode) {
4610 case NVME_CMD_WRITE:
4611 return nvme_write(n, req);
4612 case NVME_CMD_READ:
4613 return nvme_read(n, req);
4614 case NVME_CMD_COMPARE:
4615 return nvme_compare(n, req);
4616 case NVME_CMD_WRITE_ZEROES:
4617 return nvme_write_zeroes(n, req);
4618 case NVME_CMD_DSM:
4619 return nvme_dsm(n, req);
4620 case NVME_CMD_VERIFY:
4621 return nvme_verify(n, req);
4622 case NVME_CMD_COPY:
4623 return nvme_copy(n, req);
4624 case NVME_CMD_IO_MGMT_RECV:
4625 return nvme_io_mgmt_recv(n, req);
4626 case NVME_CMD_IO_MGMT_SEND:
4627 return nvme_io_mgmt_send(n, req);
4628 }
4629
4630 g_assert_not_reached();
4631 }
4632
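/*
 * Per-command-set dispatch: the commands supported and effects tables in
 * n->cse.iocs gate which opcodes are admitted for each I/O command set. The
 * zoned dispatcher handles the ZNS-specific opcodes itself and defers all
 * other opcodes to the base NVM handler.
 */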
4633 static uint16_t nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req)
4634 {
4635 if (!(n->cse.iocs.nvm[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4636 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4637 return NVME_INVALID_OPCODE | NVME_DNR;
4638 }
4639
4640 return __nvme_io_cmd_nvm(n, req);
4641 }
4642
4643 static uint16_t nvme_io_cmd_zoned(NvmeCtrl *n, NvmeRequest *req)
4644 {
4645 if (!(n->cse.iocs.zoned[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4646 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4647 return NVME_INVALID_OPCODE | NVME_DNR;
4648 }
4649
4650 switch (req->cmd.opcode) {
4651 case NVME_CMD_ZONE_APPEND:
4652 return nvme_zone_append(n, req);
4653 case NVME_CMD_ZONE_MGMT_SEND:
4654 return nvme_zone_mgmt_send(n, req);
4655 case NVME_CMD_ZONE_MGMT_RECV:
4656 return nvme_zone_mgmt_recv(n, req);
4657 }
4658
4659 return __nvme_io_cmd_nvm(n, req);
4660 }
4661
4662 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4663 {
4664 NvmeNamespace *ns;
4665 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4666
4667 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4668 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4669
4670 /*
4671 * In the base NVM command set, Flush may apply to all namespaces
4672 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4673 * along with TP 4056 (Namespace Types), the semantics become ambiguous.
4674 *
4675 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4676 * opcode with a specific command since we cannot determine a unique I/O
4677 * command set. Opcode 0h need not mean anything like flushing in another
4678 * command set; suppose it in fact has completely different semantics
4679 * there - does an NSID of FFFFFFFFh then
4680 * mean "for all namespaces, apply whatever command set specific command
4681 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4682 * whatever command that uses the 0h opcode if, and only if, it allows NSID
4683 * to be FFFFFFFFh"?
4684 *
4685 * Anyway (and luckily), for now, we do not care about this since the
4686 * device only supports namespace types that include the NVM Flush command
4687 * (NVM and Zoned), so always do an NVM Flush.
4688 */
4689
4690 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4691 return nvme_flush(n, req);
4692 }
4693
4694 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4695 return NVME_INVALID_NSID | NVME_DNR;
4696 }
4697
4698 ns = nvme_ns(n, nsid);
4699 if (unlikely(!ns)) {
4700 return NVME_INVALID_FIELD | NVME_DNR;
4701 }
4702
4703 if (ns->status) {
4704 return ns->status;
4705 }
4706
4707 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4708 return NVME_INVALID_FIELD;
4709 }
4710
4711 req->ns = ns;
4712
4713 switch (ns->csi) {
4714 case NVME_CSI_NVM:
4715 return nvme_io_cmd_nvm(n, req);
4716 case NVME_CSI_ZONED:
4717 return nvme_io_cmd_zoned(n, req);
4718 }
4719
4720 g_assert_not_reached();
4721 }
4722
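/*
 * ioeventfd support: when shadow doorbells are enabled, an eventfd can be
 * bound to each I/O queue's doorbell register so that a guest doorbell
 * write wakes the queue handler directly instead of going through the MMIO
 * write path. The offsets mirror the doorbell layout in BAR0: 0x1000 +
 * (qid << 3) for a submission queue and 0x1000 + (qid << 3) + 4 for the
 * paired completion queue.
 */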
4723 static void nvme_cq_notifier(EventNotifier *e)
4724 {
4725 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4726 NvmeCtrl *n = cq->ctrl;
4727
4728 if (!event_notifier_test_and_clear(e)) {
4729 return;
4730 }
4731
4732 nvme_update_cq_head(cq);
4733
4734 if (cq->tail == cq->head) {
4735 if (cq->irq_enabled) {
4736 n->cq_pending--;
4737 }
4738
4739 nvme_irq_deassert(n, cq);
4740 }
4741
4742 qemu_bh_schedule(cq->bh);
4743 }
4744
4745 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4746 {
4747 NvmeCtrl *n = cq->ctrl;
4748 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4749 int ret;
4750
4751 ret = event_notifier_init(&cq->notifier, 0);
4752 if (ret < 0) {
4753 return ret;
4754 }
4755
4756 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4757 memory_region_add_eventfd(&n->iomem,
4758 0x1000 + offset, 4, false, 0, &cq->notifier);
4759
4760 return 0;
4761 }
4762
4763 static void nvme_sq_notifier(EventNotifier *e)
4764 {
4765 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4766
4767 if (!event_notifier_test_and_clear(e)) {
4768 return;
4769 }
4770
4771 nvme_process_sq(sq);
4772 }
4773
4774 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4775 {
4776 NvmeCtrl *n = sq->ctrl;
4777 uint16_t offset = sq->sqid << 3;
4778 int ret;
4779
4780 ret = event_notifier_init(&sq->notifier, 0);
4781 if (ret < 0) {
4782 return ret;
4783 }
4784
4785 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4786 memory_region_add_eventfd(&n->iomem,
4787 0x1000 + offset, 4, false, 0, &sq->notifier);
4788
4789 return 0;
4790 }
4791
4792 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4793 {
4794 uint16_t offset = sq->sqid << 3;
4795
4796 n->sq[sq->sqid] = NULL;
4797 qemu_bh_delete(sq->bh);
4798 if (sq->ioeventfd_enabled) {
4799 memory_region_del_eventfd(&n->iomem,
4800 0x1000 + offset, 4, false, 0, &sq->notifier);
4801 event_notifier_set_handler(&sq->notifier, NULL);
4802 event_notifier_cleanup(&sq->notifier);
4803 }
4804 g_free(sq->io_req);
4805 if (sq->sqid) {
4806 g_free(sq);
4807 }
4808 }
4809
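/*
 * Delete I/O Submission Queue: requests still in flight are cancelled with
 * NVME_CMD_ABORT_SQ_DEL, pending completions are posted, and requests parked
 * on the completion queue are returned to the submission queue's free list
 * before the queue is torn down.
 */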
4810 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4811 {
4812 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4813 NvmeRequest *r, *next;
4814 NvmeSQueue *sq;
4815 NvmeCQueue *cq;
4816 uint16_t qid = le16_to_cpu(c->qid);
4817
4818 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4819 trace_pci_nvme_err_invalid_del_sq(qid);
4820 return NVME_INVALID_QID | NVME_DNR;
4821 }
4822
4823 trace_pci_nvme_del_sq(qid);
4824
4825 sq = n->sq[qid];
4826 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4827 r = QTAILQ_FIRST(&sq->out_req_list);
4828 assert(r->aiocb);
4829 r->status = NVME_CMD_ABORT_SQ_DEL;
4830 blk_aio_cancel(r->aiocb);
4831 }
4832
4833 assert(QTAILQ_EMPTY(&sq->out_req_list));
4834
4835 if (!nvme_check_cqid(n, sq->cqid)) {
4836 cq = n->cq[sq->cqid];
4837 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4838
4839 nvme_post_cqes(cq);
4840 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4841 if (r->sq == sq) {
4842 QTAILQ_REMOVE(&cq->req_list, r, entry);
4843 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4844 }
4845 }
4846 }
4847
4848 nvme_free_sq(sq, n);
4849 return NVME_SUCCESS;
4850 }
4851
4852 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4853 uint16_t sqid, uint16_t cqid, uint16_t size)
4854 {
4855 int i;
4856 NvmeCQueue *cq;
4857
4858 sq->ctrl = n;
4859 sq->dma_addr = dma_addr;
4860 sq->sqid = sqid;
4861 sq->size = size;
4862 sq->cqid = cqid;
4863 sq->head = sq->tail = 0;
4864 sq->io_req = g_new0(NvmeRequest, sq->size);
4865
4866 QTAILQ_INIT(&sq->req_list);
4867 QTAILQ_INIT(&sq->out_req_list);
4868 for (i = 0; i < sq->size; i++) {
4869 sq->io_req[i].sq = sq;
4870 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4871 }
4872
4873 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4874 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4875
4876 if (n->dbbuf_enabled) {
4877 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4878 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4879
4880 if (n->params.ioeventfd && sq->sqid != 0) {
4881 if (!nvme_init_sq_ioeventfd(sq)) {
4882 sq->ioeventfd_enabled = true;
4883 }
4884 }
4885 }
4886
4887 assert(n->cq[cqid]);
4888 cq = n->cq[cqid];
4889 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4890 n->sq[sqid] = sq;
4891 }
4892
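/*
 * Create I/O Submission Queue: the command is rejected unless the referenced
 * completion queue already exists, the queue id is within the configured
 * number of I/O queue pairs and not already in use, the size fits within
 * CAP.MQES, the base address is page aligned and the queue is marked
 * physically contiguous.
 */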
4893 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4894 {
4895 NvmeSQueue *sq;
4896 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4897
4898 uint16_t cqid = le16_to_cpu(c->cqid);
4899 uint16_t sqid = le16_to_cpu(c->sqid);
4900 uint16_t qsize = le16_to_cpu(c->qsize);
4901 uint16_t qflags = le16_to_cpu(c->sq_flags);
4902 uint64_t prp1 = le64_to_cpu(c->prp1);
4903
4904 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4905
4906 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4907 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4908 return NVME_INVALID_CQID | NVME_DNR;
4909 }
4910 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4911 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4912 return NVME_INVALID_QID | NVME_DNR;
4913 }
4914 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4915 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4916 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4917 }
4918 if (unlikely(prp1 & (n->page_size - 1))) {
4919 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4920 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4921 }
4922 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4923 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4924 return NVME_INVALID_FIELD | NVME_DNR;
4925 }
4926 sq = g_malloc0(sizeof(*sq));
4927 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4928 return NVME_SUCCESS;
4929 }
4930
4931 struct nvme_stats {
4932 uint64_t units_read;
4933 uint64_t units_written;
4934 uint64_t read_commands;
4935 uint64_t write_commands;
4936 };
4937
4938 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4939 {
4940 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4941
4942 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4943 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4944 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4945 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4946 }
4947
4948 static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
4949 uint32_t buf_len, uint64_t off,
4950 NvmeRequest *req)
4951 {
4952 NvmeNamespace *ns = NULL;
4953 NvmeSmartLogExtended smart_l = { 0 };
4954 struct nvme_stats stats = { 0 };
4955 uint32_t trans_len;
4956
4957 if (off >= sizeof(smart_l)) {
4958 return NVME_INVALID_FIELD | NVME_DNR;
4959 }
4960
4961 /* accumulate all stats from all namespaces */
4962 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4963 ns = nvme_ns(n, i);
4964 if (ns) {
4965 nvme_set_blk_stats(ns, &stats);
4966 }
4967 }
4968
4969 smart_l.physical_media_units_written[0] = cpu_to_le64(stats.units_written);
4970 smart_l.physical_media_units_read[0] = cpu_to_le64(stats.units_read);
4971 smart_l.log_page_version = 0x0005;
4972
4973 static const uint8_t guid[16] = {
4974 0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
4975 0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
4976 };
4977 memcpy(smart_l.log_page_guid, guid, sizeof(smart_l.log_page_guid));
4978
4979 if (!rae) {
4980 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4981 }
4982
4983 trans_len = MIN(sizeof(smart_l) - off, buf_len);
4984 return nvme_c2h(n, (uint8_t *) &smart_l + off, trans_len, req);
4985 }
4986
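/*
 * The SMART Data Units Read/Written fields are expressed in thousands of
 * 512-byte units, hence the shift by BDRV_SECTOR_BITS on the byte counters
 * followed by a round-up division by 1000.
 */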
4987 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4988 uint64_t off, NvmeRequest *req)
4989 {
4990 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4991 struct nvme_stats stats = { 0 };
4992 NvmeSmartLog smart = { 0 };
4993 uint32_t trans_len;
4994 NvmeNamespace *ns;
4995 time_t current_ms;
4996 uint64_t u_read, u_written;
4997
4998 if (off >= sizeof(smart)) {
4999 return NVME_INVALID_FIELD | NVME_DNR;
5000 }
5001
5002 if (nsid != 0xffffffff) {
5003 ns = nvme_ns(n, nsid);
5004 if (!ns) {
5005 return NVME_INVALID_NSID | NVME_DNR;
5006 }
5007 nvme_set_blk_stats(ns, &stats);
5008 } else {
5009 int i;
5010
5011 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5012 ns = nvme_ns(n, i);
5013 if (!ns) {
5014 continue;
5015 }
5016 nvme_set_blk_stats(ns, &stats);
5017 }
5018 }
5019
5020 trans_len = MIN(sizeof(smart) - off, buf_len);
5021 smart.critical_warning = n->smart_critical_warning;
5022
5023 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
5024 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
5025
5026 smart.data_units_read[0] = cpu_to_le64(u_read);
5027 smart.data_units_written[0] = cpu_to_le64(u_written);
5028 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5029 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5030
5031 smart.temperature = cpu_to_le16(n->temperature);
5032
5033 if ((n->temperature >= n->features.temp_thresh_hi) ||
5034 (n->temperature <= n->features.temp_thresh_low)) {
5035 smart.critical_warning |= NVME_SMART_TEMPERATURE;
5036 }
5037
5038 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5039 smart.power_on_hours[0] =
5040 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
5041
5042 if (!rae) {
5043 nvme_clear_events(n, NVME_AER_TYPE_SMART);
5044 }
5045
5046 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
5047 }
5048
5049 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5050 uint64_t off, NvmeRequest *req)
5051 {
5052 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
5053 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
5054 struct nvme_stats stats = {};
5055 NvmeEndGrpLog info = {};
5056 int i;
5057
5058 if (!n->subsys || endgrpid != 0x1) {
5059 return NVME_INVALID_FIELD | NVME_DNR;
5060 }
5061
5062 if (off >= sizeof(info)) {
5063 return NVME_INVALID_FIELD | NVME_DNR;
5064 }
5065
5066 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5067 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
5068 if (!ns) {
5069 continue;
5070 }
5071
5072 nvme_set_blk_stats(ns, &stats);
5073 }
5074
5075 info.data_units_read[0] =
5076 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
5077 info.data_units_written[0] =
5078 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5079 info.media_units_written[0] =
5080 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5081
5082 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5083 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5084
5085 buf_len = MIN(sizeof(info) - off, buf_len);
5086
5087 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
5088 }
5089
5090
5091 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
5092 NvmeRequest *req)
5093 {
5094 uint32_t trans_len;
5095 NvmeFwSlotInfoLog fw_log = {
5096 .afi = 0x1,
5097 };
5098
5099 if (off >= sizeof(fw_log)) {
5100 return NVME_INVALID_FIELD | NVME_DNR;
5101 }
5102
5103 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5104 trans_len = MIN(sizeof(fw_log) - off, buf_len);
5105
5106 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5107 }
5108
5109 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5110 uint64_t off, NvmeRequest *req)
5111 {
5112 uint32_t trans_len;
5113 NvmeErrorLog errlog;
5114
5115 if (off >= sizeof(errlog)) {
5116 return NVME_INVALID_FIELD | NVME_DNR;
5117 }
5118
5119 if (!rae) {
5120 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5121 }
5122
5123 memset(&errlog, 0x0, sizeof(errlog));
5124 trans_len = MIN(sizeof(errlog) - off, buf_len);
5125
5126 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5127 }
5128
5129 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5130 uint64_t off, NvmeRequest *req)
5131 {
5132 uint32_t nslist[1024] = {};
5133 uint32_t trans_len;
5134 int i = 0;
5135 uint32_t nsid;
5136
5137 if (off >= sizeof(nslist)) {
5138 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5139 return NVME_INVALID_FIELD | NVME_DNR;
5140 }
5141
5142 trans_len = MIN(sizeof(nslist) - off, buf_len);
5143
5144 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5145 NVME_CHANGED_NSID_SIZE) {
5146 /*
5147 * If more than 1024 namespaces have changed, the spec requires the first
5148 * entry in the log page to be set to FFFFFFFFh and the rest to 0.
5149 */
5150 if (i == ARRAY_SIZE(nslist)) {
5151 memset(nslist, 0x0, sizeof(nslist));
5152 nslist[0] = 0xffffffff;
5153 break;
5154 }
5155
5156 nslist[i++] = nsid;
5157 clear_bit(nsid, n->changed_nsids);
5158 }
5159
5160 /*
5161 * Clear any remaining changed-namespace bits if we broke out early because
5162 * more than 1024 namespaces have changed.
5163 */
5164 if (nslist[0] == 0xffffffff) {
5165 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5166 }
5167
5168 if (!rae) {
5169 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5170 }
5171
5172 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5173 }
5174
5175 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5176 uint64_t off, NvmeRequest *req)
5177 {
5178 NvmeEffectsLog log = {};
5179 const uint32_t *iocs = NULL;
5180 uint32_t trans_len;
5181
5182 if (off >= sizeof(log)) {
5183 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5184 return NVME_INVALID_FIELD | NVME_DNR;
5185 }
5186
5187 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5188 case NVME_CC_CSS_NVM:
5189 iocs = n->cse.iocs.nvm;
5190 break;
5191
5192 case NVME_CC_CSS_ALL:
5193 switch (csi) {
5194 case NVME_CSI_NVM:
5195 iocs = n->cse.iocs.nvm;
5196 break;
5197 case NVME_CSI_ZONED:
5198 iocs = n->cse.iocs.zoned;
5199 break;
5200 }
5201
5202 break;
5203 }
5204
5205 memcpy(log.acs, n->cse.acs, sizeof(log.acs));
5206
5207 if (iocs) {
5208 memcpy(log.iocs, iocs, sizeof(log.iocs));
5209 }
5210
5211 trans_len = MIN(sizeof(log) - off, buf_len);
5212
5213 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5214 }
5215
5216 static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae,
5217 uint32_t buf_len, uint64_t off,
5218 NvmeRequest *req, uint8_t lid)
5219 {
5220 switch (lid) {
5221 case NVME_OCP_EXTENDED_SMART_INFO:
5222 if (n->params.ocp) {
5223 return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
5224 }
5225 break;
5226 /* add a case for each additional vendor specific log id */
5227 }
5228
5229 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5230 return NVME_INVALID_FIELD | NVME_DNR;
5231 }
5232
5233 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5234 {
5235 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5236 + vss;
5237 return ROUND_UP(entry_siz, 8);
5238 }
5239
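/*
 * FDP Configurations log page: a single configuration descriptor is
 * reported. With FDP enabled on the endurance group the descriptor mirrors
 * the configured geometry (reclaim groups, reclaim unit handles and nominal
 * size); otherwise a minimal placeholder with one reclaim group and one
 * reclaim unit handle is returned.
 */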
5240 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5241 uint64_t off, NvmeRequest *req)
5242 {
5243 uint32_t log_size, trans_len;
5244 g_autofree uint8_t *buf = NULL;
5245 NvmeFdpDescrHdr *hdr;
5246 NvmeRuhDescr *ruhd;
5247 NvmeEnduranceGroup *endgrp;
5248 NvmeFdpConfsHdr *log;
5249 size_t nruh, fdp_descr_size;
5250 int i;
5251
5252 if (endgrpid != 1 || !n->subsys) {
5253 return NVME_INVALID_FIELD | NVME_DNR;
5254 }
5255
5256 endgrp = &n->subsys->endgrp;
5257
5258 if (endgrp->fdp.enabled) {
5259 nruh = endgrp->fdp.nruh;
5260 } else {
5261 nruh = 1;
5262 }
5263
5264 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5265 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5266
5267 if (off >= log_size) {
5268 return NVME_INVALID_FIELD | NVME_DNR;
5269 }
5270
5271 trans_len = MIN(log_size - off, buf_len);
5272
5273 buf = g_malloc0(log_size);
5274 log = (NvmeFdpConfsHdr *)buf;
5275 hdr = (NvmeFdpDescrHdr *)(log + 1);
5276 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5277
5278 log->num_confs = cpu_to_le16(0);
5279 log->size = cpu_to_le32(log_size);
5280
5281 hdr->descr_size = cpu_to_le16(fdp_descr_size);
5282 if (endgrp->fdp.enabled) {
5283 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5284 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5285 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5286 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5287 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5288 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5289 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5290
5291 for (i = 0; i < nruh; i++) {
5292 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5293 ruhd++;
5294 }
5295 } else {
5296 /* 1 bit for RUH in PIF -> 2 RUHs max. */
5297 hdr->nrg = cpu_to_le16(1);
5298 hdr->nruh = cpu_to_le16(1);
5299 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5300 hdr->nnss = cpu_to_le32(1);
5301 hdr->runs = cpu_to_le64(96 * MiB);
5302
5303 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5304 }
5305
5306 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5307 }
5308
5309 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5310 uint32_t dw10, uint32_t dw12,
5311 uint32_t buf_len, uint64_t off,
5312 NvmeRequest *req)
5313 {
5314 NvmeRuHandle *ruh;
5315 NvmeRuhuLog *hdr;
5316 NvmeRuhuDescr *ruhud;
5317 NvmeEnduranceGroup *endgrp;
5318 g_autofree uint8_t *buf = NULL;
5319 uint32_t log_size, trans_len;
5320 uint16_t i;
5321
5322 if (endgrpid != 1 || !n->subsys) {
5323 return NVME_INVALID_FIELD | NVME_DNR;
5324 }
5325
5326 endgrp = &n->subsys->endgrp;
5327
5328 if (!endgrp->fdp.enabled) {
5329 return NVME_FDP_DISABLED | NVME_DNR;
5330 }
5331
5332 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5333
5334 if (off >= log_size) {
5335 return NVME_INVALID_FIELD | NVME_DNR;
5336 }
5337
5338 trans_len = MIN(log_size - off, buf_len);
5339
5340 buf = g_malloc0(log_size);
5341 hdr = (NvmeRuhuLog *)buf;
5342 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5343
5344 ruh = endgrp->fdp.ruhs;
5345 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5346
5347 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5348 ruhud->ruha = ruh->ruha;
5349 }
5350
5351 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5352 }
5353
5354 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5355 uint64_t off, NvmeRequest *req)
5356 {
5357 NvmeEnduranceGroup *endgrp;
5358 NvmeFdpStatsLog log = {};
5359 uint32_t trans_len;
5360
5361 if (off >= sizeof(NvmeFdpStatsLog)) {
5362 return NVME_INVALID_FIELD | NVME_DNR;
5363 }
5364
5365 if (endgrpid != 1 || !n->subsys) {
5366 return NVME_INVALID_FIELD | NVME_DNR;
5367 }
5368
5369 if (!n->subsys->endgrp.fdp.enabled) {
5370 return NVME_FDP_DISABLED | NVME_DNR;
5371 }
5372
5373 endgrp = &n->subsys->endgrp;
5374
5375 trans_len = MIN(sizeof(log) - off, buf_len);
5376
5377 /* spec value is 128 bit, we only use 64 bit */
5378 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5379 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5380 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5381
5382 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5383 }
5384
5385 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5386 uint32_t buf_len, uint64_t off,
5387 NvmeRequest *req)
5388 {
5389 NvmeEnduranceGroup *endgrp;
5390 NvmeCmd *cmd = &req->cmd;
5391 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5392 uint32_t log_size, trans_len;
5393 NvmeFdpEventBuffer *ebuf;
5394 g_autofree NvmeFdpEventsLog *elog = NULL;
5395 NvmeFdpEvent *event;
5396
5397 if (endgrpid != 1 || !n->subsys) {
5398 return NVME_INVALID_FIELD | NVME_DNR;
5399 }
5400
5401 endgrp = &n->subsys->endgrp;
5402
5403 if (!endgrp->fdp.enabled) {
5404 return NVME_FDP_DISABLED | NVME_DNR;
5405 }
5406
5407 if (host_events) {
5408 ebuf = &endgrp->fdp.host_events;
5409 } else {
5410 ebuf = &endgrp->fdp.ctrl_events;
5411 }
5412
5413 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5414
5415 if (off >= log_size) {
5416 return NVME_INVALID_FIELD | NVME_DNR;
5417 }
5418
5419 trans_len = MIN(log_size - off, buf_len);
5420 elog = g_malloc0(log_size);
5421 elog->num_events = cpu_to_le32(ebuf->nelems);
5422 event = (NvmeFdpEvent *)(elog + 1);
5423
5424 if (ebuf->nelems && ebuf->start == ebuf->next) {
5425 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5426 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
5427 memcpy(event, &ebuf->events[ebuf->start],
5428 sizeof(NvmeFdpEvent) * nelems);
5429 memcpy(event + nelems, ebuf->events,
5430 sizeof(NvmeFdpEvent) * ebuf->next);
5431 } else if (ebuf->start < ebuf->next) {
5432 memcpy(event, &ebuf->events[ebuf->start],
5433 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5434 }
5435
5436 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5437 }
5438
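/*
 * Get Log Page: the transfer length is assembled from NUMDL/NUMDU as a
 * zero-based dword count and the offset from LPOL/LPOU. The offset must be
 * dword aligned and the length is bounded by MDTS before dispatching on the
 * log identifier.
 */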
5439 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5440 {
5441 NvmeCmd *cmd = &req->cmd;
5442
5443 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5444 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5445 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5446 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5447 uint8_t lid = dw10 & 0xff;
5448 uint8_t lsp = (dw10 >> 8) & 0xf;
5449 uint8_t rae = (dw10 >> 15) & 0x1;
5450 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5451 uint32_t numdl, numdu, lspi;
5452 uint64_t off, lpol, lpou;
5453 size_t len;
5454 uint16_t status;
5455
5456 numdl = (dw10 >> 16);
5457 numdu = (dw11 & 0xffff);
5458 lspi = (dw11 >> 16);
5459 lpol = dw12;
5460 lpou = dw13;
5461
5462 len = (((numdu << 16) | numdl) + 1) << 2;
5463 off = (lpou << 32ULL) | lpol;
5464
5465 if (off & 0x3) {
5466 return NVME_INVALID_FIELD | NVME_DNR;
5467 }
5468
5469 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5470
5471 status = nvme_check_mdts(n, len);
5472 if (status) {
5473 return status;
5474 }
5475
5476 switch (lid) {
5477 case NVME_LOG_ERROR_INFO:
5478 return nvme_error_info(n, rae, len, off, req);
5479 case NVME_LOG_SMART_INFO:
5480 return nvme_smart_info(n, rae, len, off, req);
5481 case NVME_LOG_FW_SLOT_INFO:
5482 return nvme_fw_log_info(n, len, off, req);
5483 case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
5484 return nvme_vendor_specific_log(n, rae, len, off, req, lid);
5485 case NVME_LOG_CHANGED_NSLIST:
5486 return nvme_changed_nslist(n, rae, len, off, req);
5487 case NVME_LOG_CMD_EFFECTS:
5488 return nvme_cmd_effects(n, csi, len, off, req);
5489 case NVME_LOG_ENDGRP:
5490 return nvme_endgrp_info(n, rae, len, off, req);
5491 case NVME_LOG_FDP_CONFS:
5492 return nvme_fdp_confs(n, lspi, len, off, req);
5493 case NVME_LOG_FDP_RUH_USAGE:
5494 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5495 case NVME_LOG_FDP_STATS:
5496 return nvme_fdp_stats(n, lspi, len, off, req);
5497 case NVME_LOG_FDP_EVENTS:
5498 return nvme_fdp_events(n, lspi, len, off, req);
5499 default:
5500 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5501 return NVME_INVALID_FIELD | NVME_DNR;
5502 }
5503 }
5504
5505 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5506 {
5507 PCIDevice *pci = PCI_DEVICE(n);
5508 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5509
5510 n->cq[cq->cqid] = NULL;
5511 qemu_bh_delete(cq->bh);
5512 if (cq->ioeventfd_enabled) {
5513 memory_region_del_eventfd(&n->iomem,
5514 0x1000 + offset, 4, false, 0, &cq->notifier);
5515 event_notifier_set_handler(&cq->notifier, NULL);
5516 event_notifier_cleanup(&cq->notifier);
5517 }
5518 if (msix_enabled(pci) && cq->irq_enabled) {
5519 msix_vector_unuse(pci, cq->vector);
5520 }
5521 if (cq->cqid) {
5522 g_free(cq);
5523 }
5524 }
5525
5526 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5527 {
5528 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5529 NvmeCQueue *cq;
5530 uint16_t qid = le16_to_cpu(c->qid);
5531
5532 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5533 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5534 return NVME_INVALID_CQID | NVME_DNR;
5535 }
5536
5537 cq = n->cq[qid];
5538 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5539 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5540 return NVME_INVALID_QUEUE_DEL;
5541 }
5542
5543 if (cq->irq_enabled && cq->tail != cq->head) {
5544 n->cq_pending--;
5545 }
5546
5547 nvme_irq_deassert(n, cq);
5548 trace_pci_nvme_del_cq(qid);
5549 nvme_free_cq(cq, n);
5550 return NVME_SUCCESS;
5551 }
5552
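/*
 * Queue initialization also wires up the optional shadow doorbell buffers:
 * once the Doorbell Buffer Config command has been processed
 * (dbbuf_enabled), each queue caches the guest addresses of its shadow
 * doorbell and event index slots, and non-admin queues may additionally be
 * driven via ioeventfd.
 */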
5553 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5554 uint16_t cqid, uint16_t vector, uint16_t size,
5555 uint16_t irq_enabled)
5556 {
5557 PCIDevice *pci = PCI_DEVICE(n);
5558
5559 if (msix_enabled(pci) && irq_enabled) {
5560 msix_vector_use(pci, vector);
5561 }
5562
5563 cq->ctrl = n;
5564 cq->cqid = cqid;
5565 cq->size = size;
5566 cq->dma_addr = dma_addr;
5567 cq->phase = 1;
5568 cq->irq_enabled = irq_enabled;
5569 cq->vector = vector;
5570 cq->head = cq->tail = 0;
5571 QTAILQ_INIT(&cq->req_list);
5572 QTAILQ_INIT(&cq->sq_list);
5573 if (n->dbbuf_enabled) {
5574 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5575 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5576
5577 if (n->params.ioeventfd && cqid != 0) {
5578 if (!nvme_init_cq_ioeventfd(cq)) {
5579 cq->ioeventfd_enabled = true;
5580 }
5581 }
5582 }
5583 n->cq[cqid] = cq;
5584 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5585 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5586 }
5587
5588 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5589 {
5590 NvmeCQueue *cq;
5591 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5592 uint16_t cqid = le16_to_cpu(c->cqid);
5593 uint16_t vector = le16_to_cpu(c->irq_vector);
5594 uint16_t qsize = le16_to_cpu(c->qsize);
5595 uint16_t qflags = le16_to_cpu(c->cq_flags);
5596 uint64_t prp1 = le64_to_cpu(c->prp1);
5597 uint32_t cc = ldq_le_p(&n->bar.cc);
5598 uint8_t iocqes = NVME_CC_IOCQES(cc);
5599 uint8_t iosqes = NVME_CC_IOSQES(cc);
5600
5601 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5602 NVME_CQ_FLAGS_IEN(qflags) != 0);
5603
5604 if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5605 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5606 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5607 }
5608
5609 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5610 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5611 return NVME_INVALID_QID | NVME_DNR;
5612 }
5613 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5614 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5615 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5616 }
5617 if (unlikely(prp1 & (n->page_size - 1))) {
5618 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5619 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5620 }
5621 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5622 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5623 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5624 }
5625 if (unlikely(vector >= n->conf_msix_qsize)) {
5626 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5627 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5628 }
5629 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5630 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5631 return NVME_INVALID_FIELD | NVME_DNR;
5632 }
5633
5634 cq = g_malloc0(sizeof(*cq));
5635 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5636 NVME_CQ_FLAGS_IEN(qflags));
5637
5638 /*
5639 * It is only required to set qs_created when creating a completion queue;
5640 * creating a submission queue without a matching completion queue will
5641 * fail.
5642 */
5643 n->qs_created = true;
5644 return NVME_SUCCESS;
5645 }
5646
5647 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5648 {
5649 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5650
5651 return nvme_c2h(n, id, sizeof(id), req);
5652 }
5653
5654 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5655 {
5656 trace_pci_nvme_identify_ctrl();
5657
5658 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5659 }
5660
5661 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5662 {
5663 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5664 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5665 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5666
5667 trace_pci_nvme_identify_ctrl_csi(c->csi);
5668
5669 switch (c->csi) {
5670 case NVME_CSI_NVM:
5671 id_nvm->vsl = n->params.vsl;
5672 id_nvm->dmrl = NVME_ID_CTRL_NVM_DMRL_MAX;
5673 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5674 id_nvm->dmsl = NVME_ID_CTRL_NVM_DMRL_MAX * n->dmrsl;
5675 break;
5676
5677 case NVME_CSI_ZONED:
5678 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5679 break;
5680
5681 default:
5682 return NVME_INVALID_FIELD | NVME_DNR;
5683 }
5684
5685 return nvme_c2h(n, id, sizeof(id), req);
5686 }
5687
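/*
 * The 'active' flag distinguishes the Identify Namespace variants that
 * report only namespaces attached to this controller from the 'allocated'
 * variants that also cover namespaces present in the subsystem but not
 * attached; in either case an unknown NSID yields the all-zeroes identify
 * data structure.
 */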
5688 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5689 {
5690 NvmeNamespace *ns;
5691 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5692 uint32_t nsid = le32_to_cpu(c->nsid);
5693
5694 trace_pci_nvme_identify_ns(nsid);
5695
5696 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5697 return NVME_INVALID_NSID | NVME_DNR;
5698 }
5699
5700 ns = nvme_ns(n, nsid);
5701 if (unlikely(!ns)) {
5702 if (!active) {
5703 ns = nvme_subsys_ns(n->subsys, nsid);
5704 if (!ns) {
5705 return nvme_rpt_empty_id_struct(n, req);
5706 }
5707 } else {
5708 return nvme_rpt_empty_id_struct(n, req);
5709 }
5710 }
5711
5712 if (active || ns->csi == NVME_CSI_NVM) {
5713 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5714 }
5715
5716 return NVME_INVALID_IOCS | NVME_DNR;
5717 }
5718
5719 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5720 bool attached)
5721 {
5722 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5723 uint32_t nsid = le32_to_cpu(c->nsid);
5724 uint16_t min_id = le16_to_cpu(c->ctrlid);
5725 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5726 uint16_t *ids = &list[1];
5727 NvmeNamespace *ns;
5728 NvmeCtrl *ctrl;
5729 int cntlid, nr_ids = 0;
5730
5731 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5732
5733 if (!n->subsys) {
5734 return NVME_INVALID_FIELD | NVME_DNR;
5735 }
5736
5737 if (attached) {
5738 if (nsid == NVME_NSID_BROADCAST) {
5739 return NVME_INVALID_FIELD | NVME_DNR;
5740 }
5741
5742 ns = nvme_subsys_ns(n->subsys, nsid);
5743 if (!ns) {
5744 return NVME_INVALID_FIELD | NVME_DNR;
5745 }
5746 }
5747
5748 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5749 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5750 if (!ctrl) {
5751 continue;
5752 }
5753
5754 if (attached && !nvme_ns(ctrl, nsid)) {
5755 continue;
5756 }
5757
5758 ids[nr_ids++] = cntlid;
5759 }
5760
5761 list[0] = nr_ids;
5762
5763 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5764 }
5765
5766 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5767 {
5768 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5769
5770 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5771 sizeof(NvmePriCtrlCap), req);
5772 }
5773
5774 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5775 {
5776 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5777 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5778 uint16_t min_id = le16_to_cpu(c->ctrlid);
5779 uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5780 NvmeSecCtrlList list = {0};
5781 uint8_t i;
5782
5783 for (i = 0; i < num_sec_ctrl; i++) {
5784 if (n->sec_ctrl_list[i].scid >= min_id) {
5785 list.numcntl = MIN(num_sec_ctrl - i, 127);
5786 memcpy(&list.sec, n->sec_ctrl_list + i,
5787 list.numcntl * sizeof(NvmeSecCtrlEntry));
5788 break;
5789 }
5790 }
5791
5792 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5793
5794 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5795 }
5796
5797 static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc)
5798 {
5799 NvmeNamespace *ns;
5800 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5801 uint32_t nsid = le32_to_cpu(c->nsid);
5802
5803 trace_pci_nvme_identify_ns_ind(nsid);
5804
5805 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5806 return NVME_INVALID_NSID | NVME_DNR;
5807 }
5808
5809 ns = nvme_ns(n, nsid);
5810 if (unlikely(!ns)) {
5811 if (alloc) {
5812 ns = nvme_subsys_ns(n->subsys, nsid);
5813 if (!ns) {
5814 return nvme_rpt_empty_id_struct(n, req);
5815 }
5816 } else {
5817 return nvme_rpt_empty_id_struct(n, req);
5818 }
5819 }
5820
5821 return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req);
5822 }
5823
5824 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5825 bool active)
5826 {
5827 NvmeNamespace *ns;
5828 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5829 uint32_t nsid = le32_to_cpu(c->nsid);
5830
5831 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5832
5833 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5834 return NVME_INVALID_NSID | NVME_DNR;
5835 }
5836
5837 ns = nvme_ns(n, nsid);
5838 if (unlikely(!ns)) {
5839 if (!active) {
5840 ns = nvme_subsys_ns(n->subsys, nsid);
5841 if (!ns) {
5842 return nvme_rpt_empty_id_struct(n, req);
5843 }
5844 } else {
5845 return nvme_rpt_empty_id_struct(n, req);
5846 }
5847 }
5848
5849 if (c->csi == NVME_CSI_NVM) {
5850 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5851 req);
5852 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5853 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5854 req);
5855 }
5856
5857 return NVME_INVALID_FIELD | NVME_DNR;
5858 }
5859
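/*
 * Active (or, with active == false, allocated) Namespace ID List. Returns
 * the NSIDs greater than the NSID in the command, in ascending order.
 */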
5860 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5861 bool active)
5862 {
5863 NvmeNamespace *ns;
5864 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5865 uint32_t min_nsid = le32_to_cpu(c->nsid);
5866 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5867 static const int data_len = sizeof(list);
5868 uint32_t *list_ptr = (uint32_t *)list;
5869 int i, j = 0;
5870
5871 trace_pci_nvme_identify_nslist(min_nsid);
5872
5873 /*
5874 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5875 * since the Active Namespace ID List should return namespaces with ids
5876 * *higher* than the NSID specified in the command. This is also specified
5877 * in the spec (NVM Express v1.3d, Section 5.15.4).
5878 */
5879 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5880 return NVME_INVALID_NSID | NVME_DNR;
5881 }
5882
5883 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5884 ns = nvme_ns(n, i);
5885 if (!ns) {
5886 if (!active) {
5887 ns = nvme_subsys_ns(n->subsys, i);
5888 if (!ns) {
5889 continue;
5890 }
5891 } else {
5892 continue;
5893 }
5894 }
5895 if (ns->params.nsid <= min_nsid) {
5896 continue;
5897 }
5898 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5899 if (j == data_len / sizeof(uint32_t)) {
5900 break;
5901 }
5902 }
5903
5904 return nvme_c2h(n, list, data_len, req);
5905 }
5906
5907 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5908 bool active)
5909 {
5910 NvmeNamespace *ns;
5911 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5912 uint32_t min_nsid = le32_to_cpu(c->nsid);
5913 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5914 static const int data_len = sizeof(list);
5915 uint32_t *list_ptr = (uint32_t *)list;
5916 int i, j = 0;
5917
5918 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5919
5920 /*
5921 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5922 */
5923 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5924 return NVME_INVALID_NSID | NVME_DNR;
5925 }
5926
5927 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5928 return NVME_INVALID_FIELD | NVME_DNR;
5929 }
5930
5931 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5932 ns = nvme_ns(n, i);
5933 if (!ns) {
5934 if (!active) {
5935 ns = nvme_subsys_ns(n->subsys, i);
5936 if (!ns) {
5937 continue;
5938 }
5939 } else {
5940 continue;
5941 }
5942 }
5943 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5944 continue;
5945 }
5946 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5947 if (j == data_len / sizeof(uint32_t)) {
5948 break;
5949 }
5950 }
5951
5952 return nvme_c2h(n, list, data_len, req);
5953 }
5954
5955 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5956 {
5957 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5958 uint16_t *nr_ids = &list[0];
5959 uint16_t *ids = &list[1];
5960 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5961
5962 /*
5963 * The current nvme-subsys only supports Endurance Group #1.
5964 */
5965 if (!endgid) {
5966 *nr_ids = 1;
5967 ids[0] = 1;
5968 } else {
5969 *nr_ids = 0;
5970 }
5971
5972 return nvme_c2h(n, list, sizeof(list), req);
5973 }
5974
5975 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5976 {
5977 NvmeNamespace *ns;
5978 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5979 uint32_t nsid = le32_to_cpu(c->nsid);
5980 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5981 uint8_t *pos = list;
5982 struct {
5983 NvmeIdNsDescr hdr;
5984 uint8_t v[NVME_NIDL_UUID];
5985 } QEMU_PACKED uuid = {};
5986 struct {
5987 NvmeIdNsDescr hdr;
5988 uint8_t v[NVME_NIDL_NGUID];
5989 } QEMU_PACKED nguid = {};
5990 struct {
5991 NvmeIdNsDescr hdr;
5992 uint64_t v;
5993 } QEMU_PACKED eui64 = {};
5994 struct {
5995 NvmeIdNsDescr hdr;
5996 uint8_t v;
5997 } QEMU_PACKED csi = {};
5998
5999 trace_pci_nvme_identify_ns_descr_list(nsid);
6000
6001 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6002 return NVME_INVALID_NSID | NVME_DNR;
6003 }
6004
6005 ns = nvme_ns(n, nsid);
6006 if (unlikely(!ns)) {
6007 return NVME_INVALID_FIELD | NVME_DNR;
6008 }
6009
6010 if (!qemu_uuid_is_null(&ns->params.uuid)) {
6011 uuid.hdr.nidt = NVME_NIDT_UUID;
6012 uuid.hdr.nidl = NVME_NIDL_UUID;
6013 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
6014 memcpy(pos, &uuid, sizeof(uuid));
6015 pos += sizeof(uuid);
6016 }
6017
6018 if (!nvme_nguid_is_null(&ns->params.nguid)) {
6019 nguid.hdr.nidt = NVME_NIDT_NGUID;
6020 nguid.hdr.nidl = NVME_NIDL_NGUID;
6021 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
6022 memcpy(pos, &nguid, sizeof(nguid));
6023 pos += sizeof(nguid);
6024 }
6025
6026 if (ns->params.eui64) {
6027 eui64.hdr.nidt = NVME_NIDT_EUI64;
6028 eui64.hdr.nidl = NVME_NIDL_EUI64;
6029 eui64.v = cpu_to_be64(ns->params.eui64);
6030 memcpy(pos, &eui64, sizeof(eui64));
6031 pos += sizeof(eui64);
6032 }
6033
6034 csi.hdr.nidt = NVME_NIDT_CSI;
6035 csi.hdr.nidl = NVME_NIDL_CSI;
6036 csi.v = ns->csi;
6037 memcpy(pos, &csi, sizeof(csi));
6038 pos += sizeof(csi);
6039
6040 return nvme_c2h(n, list, sizeof(list), req);
6041 }
6042
6043 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
6044 {
6045 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
6046 static const int data_len = sizeof(list);
6047
6048 trace_pci_nvme_identify_cmd_set();
6049
6050 NVME_SET_CSI(*list, NVME_CSI_NVM);
6051 NVME_SET_CSI(*list, NVME_CSI_ZONED);
6052
6053 return nvme_c2h(n, list, data_len, req);
6054 }
6055
6056 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
6057 {
6058 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
6059
6060 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
6061 c->csi);
6062
6063 switch (c->cns) {
6064 case NVME_ID_CNS_NS:
6065 return nvme_identify_ns(n, req, true);
6066 case NVME_ID_CNS_NS_PRESENT:
6067 return nvme_identify_ns(n, req, false);
6068 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
6069 return nvme_identify_ctrl_list(n, req, true);
6070 case NVME_ID_CNS_CTRL_LIST:
6071 return nvme_identify_ctrl_list(n, req, false);
6072 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
6073 return nvme_identify_pri_ctrl_cap(n, req);
6074 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
6075 return nvme_identify_sec_ctrl_list(n, req);
6076 case NVME_ID_CNS_CS_NS:
6077 return nvme_identify_ns_csi(n, req, true);
6078 case NVME_ID_CNS_CS_IND_NS:
6079 return nvme_identify_ns_ind(n, req, false);
6080 case NVME_ID_CNS_CS_IND_NS_ALLOCATED:
6081 return nvme_identify_ns_ind(n, req, true);
6082 case NVME_ID_CNS_CS_NS_PRESENT:
6083 return nvme_identify_ns_csi(n, req, false);
6084 case NVME_ID_CNS_CTRL:
6085 return nvme_identify_ctrl(n, req);
6086 case NVME_ID_CNS_CS_CTRL:
6087 return nvme_identify_ctrl_csi(n, req);
6088 case NVME_ID_CNS_NS_ACTIVE_LIST:
6089 return nvme_identify_nslist(n, req, true);
6090 case NVME_ID_CNS_NS_PRESENT_LIST:
6091 return nvme_identify_nslist(n, req, false);
6092 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
6093 return nvme_identify_nslist_csi(n, req, true);
6094 case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
6095 return nvme_endurance_group_list(n, req);
6096 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
6097 return nvme_identify_nslist_csi(n, req, false);
6098 case NVME_ID_CNS_NS_DESCR_LIST:
6099 return nvme_identify_ns_descr_list(n, req);
6100 case NVME_ID_CNS_IO_COMMAND_SET:
6101 return nvme_identify_cmd_set(n, req);
6102 default:
6103 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
6104 return NVME_INVALID_FIELD | NVME_DNR;
6105 }
6106 }
6107
6108 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
6109 {
6110 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
6111 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
6112 NvmeSQueue *sq = n->sq[sqid];
6113 NvmeRequest *r, *next;
6114 int i;
6115
6116 req->cqe.result = 1;
6117 if (nvme_check_sqid(n, sqid)) {
6118 return NVME_INVALID_FIELD | NVME_DNR;
6119 }
6120
6121 if (sqid == 0) {
6122 for (i = 0; i < n->outstanding_aers; i++) {
6123 NvmeRequest *re = n->aer_reqs[i];
6124 if (re->cqe.cid == cid) {
6125 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
6126 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
6127 n->outstanding_aers--;
6128 re->status = NVME_CMD_ABORT_REQ;
6129 req->cqe.result = 0;
6130 nvme_enqueue_req_completion(&n->admin_cq, re);
6131 return NVME_SUCCESS;
6132 }
6133 }
6134 }
6135
6136 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
6137 if (r->cqe.cid == cid) {
6138 if (r->aiocb) {
6139 r->status = NVME_CMD_ABORT_REQ;
6140 blk_aio_cancel_async(r->aiocb);
6141 }
6142 break;
6143 }
6144 }
6145
6146 return NVME_SUCCESS;
6147 }
6148
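/*
 * Store the host supplied timestamp along with the current value of the
 * virtual clock so that nvme_get_timestamp() can extrapolate it later.
 */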
6149 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6150 {
6151 trace_pci_nvme_setfeat_timestamp(ts);
6152
6153 n->host_timestamp = le64_to_cpu(ts);
6154 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6155 }
6156
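/*
 * Reconstruct the 48-bit timestamp by adding the milliseconds elapsed on
 * the virtual clock since the timestamp was last set. The origin field
 * reports 01b whenever the host has set a non-zero timestamp.
 */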
6157 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6158 {
6159 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6160 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6161
6162 union nvme_timestamp {
6163 struct {
6164 uint64_t timestamp:48;
6165 uint64_t sync:1;
6166 uint64_t origin:3;
6167 uint64_t rsvd1:12;
6168 };
6169 uint64_t all;
6170 };
6171
6172 union nvme_timestamp ts;
6173 ts.all = 0;
6174 ts.timestamp = n->host_timestamp + elapsed_time;
6175
6176 /* If the host timestamp is non-zero, set the timestamp origin */
6177 ts.origin = n->host_timestamp ? 0x01 : 0x00;
6178
6179 trace_pci_nvme_getfeat_timestamp(ts.all);
6180
6181 return cpu_to_le64(ts.all);
6182 }
6183
6184 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6185 {
6186 uint64_t timestamp = nvme_get_timestamp(n);
6187
6188 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6189 }
6190
6191 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6192 uint32_t *result)
6193 {
6194 *result = 0;
6195
6196 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6197 return NVME_INVALID_FIELD | NVME_DNR;
6198 }
6199
6200 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6201 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6202
6203 return NVME_SUCCESS;
6204 }
6205
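/*
 * Get Features, FDP Events. Builds a list of supported event descriptors
 * for the reclaim unit handle backing the given placement handle and
 * reports whether each event type is enabled in its event filter.
 */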
6206 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6207 NvmeRequest *req, uint32_t *result)
6208 {
6209 NvmeCmd *cmd = &req->cmd;
6210 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6211 uint16_t ph = cdw11 & 0xffff;
6212 uint8_t noet = (cdw11 >> 16) & 0xff;
6213 uint16_t ruhid, ret;
6214 uint32_t nentries = 0;
6215 uint8_t s_events_ndx = 0;
6216 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6217 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6218 NvmeRuHandle *ruh;
6219 NvmeFdpEventDescr *s_event;
6220
6221 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6222 return NVME_FDP_DISABLED | NVME_DNR;
6223 }
6224
6225 if (!nvme_ph_valid(ns, ph)) {
6226 return NVME_INVALID_FIELD | NVME_DNR;
6227 }
6228
6229 ruhid = ns->fdp.phs[ph];
6230 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6231
6232 assert(ruh);
6233
6234 if (unlikely(noet == 0)) {
6235 return NVME_INVALID_FIELD | NVME_DNR;
6236 }
6237
6238 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6239 uint8_t shift = nvme_fdp_evf_shifts[event_type];
6240 if (!shift && event_type) {
6241 /*
6242 * Only the first entry (event_type == 0) has a shift value of 0;
6243 * other entries are simply unpopulated.
6244 */
6245 continue;
6246 }
6247
6248 nentries++;
6249
6250 s_event = &s_events[s_events_ndx];
6251 s_event->evt = event_type;
6252 s_event->evta = (ruh->event_filter >> shift) & 0x1;
6253
6254 /* break if all `noet` entries are filled */
6255 if ((++s_events_ndx) == noet) {
6256 break;
6257 }
6258 }
6259
6260 ret = nvme_c2h(n, s_events, s_events_siz, req);
6261 if (ret) {
6262 return ret;
6263 }
6264
6265 *result = nentries;
6266 return NVME_SUCCESS;
6267 }
6268
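/*
 * Get Features. SEL 01b (saved) falls through to the default values since
 * no features are saveable by this controller; SEL 11b returns the
 * per-feature capabilities instead of a value.
 */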
6269 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6270 {
6271 NvmeCmd *cmd = &req->cmd;
6272 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6273 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6274 uint32_t nsid = le32_to_cpu(cmd->nsid);
6275 uint32_t result = 0;
6276 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6277 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6278 uint16_t iv;
6279 NvmeNamespace *ns;
6280 int i;
6281 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6282
6283 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6284 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6285 };
6286
6287 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6288
6289 if (!nvme_feature_support[fid]) {
6290 return NVME_INVALID_FIELD | NVME_DNR;
6291 }
6292
6293 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6294 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6295 /*
6296 * The Reservation Notification Mask and Reservation Persistence
6297 * features require a status code of Invalid Field in Command when
6298 * NSID is FFFFFFFFh. Since the device does not support those
6299 * features we can always return Invalid Namespace or Format as we
6300 * should do for all other features.
6301 */
6302 return NVME_INVALID_NSID | NVME_DNR;
6303 }
6304
6305 if (!nvme_ns(n, nsid)) {
6306 return NVME_INVALID_FIELD | NVME_DNR;
6307 }
6308 }
6309
6310 switch (sel) {
6311 case NVME_GETFEAT_SELECT_CURRENT:
6312 break;
6313 case NVME_GETFEAT_SELECT_SAVED:
6314 /* no features are saveable by the controller; fallthrough */
6315 case NVME_GETFEAT_SELECT_DEFAULT:
6316 goto defaults;
6317 case NVME_GETFEAT_SELECT_CAP:
6318 result = nvme_feature_cap[fid];
6319 goto out;
6320 }
6321
6322 switch (fid) {
6323 case NVME_TEMPERATURE_THRESHOLD:
6324 result = 0;
6325
6326 /*
6327 * The controller only implements the Composite Temperature sensor, so
6328 * return 0 for all other sensors.
6329 */
6330 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6331 goto out;
6332 }
6333
6334 switch (NVME_TEMP_THSEL(dw11)) {
6335 case NVME_TEMP_THSEL_OVER:
6336 result = n->features.temp_thresh_hi;
6337 goto out;
6338 case NVME_TEMP_THSEL_UNDER:
6339 result = n->features.temp_thresh_low;
6340 goto out;
6341 }
6342
6343 return NVME_INVALID_FIELD | NVME_DNR;
6344 case NVME_ERROR_RECOVERY:
6345 if (!nvme_nsid_valid(n, nsid)) {
6346 return NVME_INVALID_NSID | NVME_DNR;
6347 }
6348
6349 ns = nvme_ns(n, nsid);
6350 if (unlikely(!ns)) {
6351 return NVME_INVALID_FIELD | NVME_DNR;
6352 }
6353
6354 result = ns->features.err_rec;
6355 goto out;
6356 case NVME_VOLATILE_WRITE_CACHE:
6357 result = 0;
6358 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6359 ns = nvme_ns(n, i);
6360 if (!ns) {
6361 continue;
6362 }
6363
6364 result = blk_enable_write_cache(ns->blkconf.blk);
6365 if (result) {
6366 break;
6367 }
6368 }
6369 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6370 goto out;
6371 case NVME_ASYNCHRONOUS_EVENT_CONF:
6372 result = n->features.async_config;
6373 goto out;
6374 case NVME_TIMESTAMP:
6375 return nvme_get_feature_timestamp(n, req);
6376 case NVME_HOST_BEHAVIOR_SUPPORT:
6377 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6378 sizeof(n->features.hbs), req);
6379 case NVME_FDP_MODE:
6380 endgrpid = dw11 & 0xff;
6381
6382 if (endgrpid != 0x1) {
6383 return NVME_INVALID_FIELD | NVME_DNR;
6384 }
6385
6386 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6387 if (ret) {
6388 return ret;
6389 }
6390 goto out;
6391 case NVME_FDP_EVENTS:
6392 if (!nvme_nsid_valid(n, nsid)) {
6393 return NVME_INVALID_NSID | NVME_DNR;
6394 }
6395
6396 ns = nvme_ns(n, nsid);
6397 if (unlikely(!ns)) {
6398 return NVME_INVALID_FIELD | NVME_DNR;
6399 }
6400
6401 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6402 if (ret) {
6403 return ret;
6404 }
6405 goto out;
6406 default:
6407 break;
6408 }
6409
6410 defaults:
6411 switch (fid) {
6412 case NVME_TEMPERATURE_THRESHOLD:
6413 result = 0;
6414
6415 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6416 break;
6417 }
6418
6419 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6420 result = NVME_TEMPERATURE_WARNING;
6421 }
6422
6423 break;
6424 case NVME_NUMBER_OF_QUEUES:
6425 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6426 trace_pci_nvme_getfeat_numq(result);
6427 break;
6428 case NVME_INTERRUPT_VECTOR_CONF:
6429 iv = dw11 & 0xffff;
6430 if (iv >= n->conf_ioqpairs + 1) {
6431 return NVME_INVALID_FIELD | NVME_DNR;
6432 }
6433
6434 result = iv;
6435 if (iv == n->admin_cq.vector) {
6436 result |= NVME_INTVC_NOCOALESCING;
6437 }
6438 break;
6439 case NVME_FDP_MODE:
6440 endgrpid = dw11 & 0xff;
6441
6442 if (endgrpid != 0x1) {
6443 return NVME_INVALID_FIELD | NVME_DNR;
6444 }
6445
6446 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6447 if (ret) {
6448 return ret;
6449 }
6450 break;
6451
6452 case NVME_WRITE_ATOMICITY:
6453 result = n->dn;
6454 break;
6455 default:
6456 result = nvme_feature_default[fid];
6457 break;
6458 }
6459
6460 out:
6461 req->cqe.result = cpu_to_le32(result);
6462 return ret;
6463 }
6464
6465 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6466 {
6467 uint16_t ret;
6468 uint64_t timestamp;
6469
6470 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6471 if (ret) {
6472 return ret;
6473 }
6474
6475 nvme_set_timestamp(n, timestamp);
6476
6477 return NVME_SUCCESS;
6478 }
6479
6480 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6481 NvmeRequest *req)
6482 {
6483 NvmeCmd *cmd = &req->cmd;
6484 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6485 uint16_t ph = cdw11 & 0xffff;
6486 uint8_t noet = (cdw11 >> 16) & 0xff;
6487 uint16_t ret, ruhid;
6488 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6489 uint8_t event_mask = 0;
6490 unsigned int i;
6491 g_autofree uint8_t *events = g_malloc0(noet);
6492 NvmeRuHandle *ruh = NULL;
6493
6494 assert(ns);
6495
6496 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6497 return NVME_FDP_DISABLED | NVME_DNR;
6498 }
6499
6500 if (!nvme_ph_valid(ns, ph)) {
6501 return NVME_INVALID_FIELD | NVME_DNR;
6502 }
6503
6504 ruhid = ns->fdp.phs[ph];
6505 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6506
6507 ret = nvme_h2c(n, events, noet, req);
6508 if (ret) {
6509 return ret;
6510 }
6511
6512 for (i = 0; i < noet; i++) {
6513 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6514 }
6515
6516 if (enable) {
6517 ruh->event_filter |= event_mask;
6518 } else {
6519 ruh->event_filter = ruh->event_filter & ~event_mask;
6520 }
6521
6522 return NVME_SUCCESS;
6523 }
6524
6525 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6526 {
6527 NvmeNamespace *ns = NULL;
6528
6529 NvmeCmd *cmd = &req->cmd;
6530 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6531 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6532 uint32_t nsid = le32_to_cpu(cmd->nsid);
6533 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6534 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6535 uint16_t status;
6536 int i;
6537 NvmeIdCtrl *id = &n->id_ctrl;
6538 NvmeAtomic *atomic = &n->atomic;
6539
6540 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6541
6542 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6543 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6544 }
6545
6546 if (!nvme_feature_support[fid]) {
6547 return NVME_INVALID_FIELD | NVME_DNR;
6548 }
6549
6550 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6551 if (nsid != NVME_NSID_BROADCAST) {
6552 if (!nvme_nsid_valid(n, nsid)) {
6553 return NVME_INVALID_NSID | NVME_DNR;
6554 }
6555
6556 ns = nvme_ns(n, nsid);
6557 if (unlikely(!ns)) {
6558 return NVME_INVALID_FIELD | NVME_DNR;
6559 }
6560 }
6561 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6562 if (!nvme_nsid_valid(n, nsid)) {
6563 return NVME_INVALID_NSID | NVME_DNR;
6564 }
6565
6566 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6567 }
6568
6569 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6570 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6571 }
6572
6573 switch (fid) {
6574 case NVME_TEMPERATURE_THRESHOLD:
6575 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6576 break;
6577 }
6578
6579 switch (NVME_TEMP_THSEL(dw11)) {
6580 case NVME_TEMP_THSEL_OVER:
6581 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6582 break;
6583 case NVME_TEMP_THSEL_UNDER:
6584 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6585 break;
6586 default:
6587 return NVME_INVALID_FIELD | NVME_DNR;
6588 }
6589
6590 if ((n->temperature >= n->features.temp_thresh_hi) ||
6591 (n->temperature <= n->features.temp_thresh_low)) {
6592 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6593 }
6594
6595 break;
6596 case NVME_ERROR_RECOVERY:
6597 if (nsid == NVME_NSID_BROADCAST) {
6598 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6599 ns = nvme_ns(n, i);
6600
6601 if (!ns) {
6602 continue;
6603 }
6604
6605 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6606 ns->features.err_rec = dw11;
6607 }
6608 }
6609
6610 break;
6611 }
6612
6613 assert(ns);
6614 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6615 ns->features.err_rec = dw11;
6616 }
6617 break;
6618 case NVME_VOLATILE_WRITE_CACHE:
6619 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6620 ns = nvme_ns(n, i);
6621 if (!ns) {
6622 continue;
6623 }
6624
6625 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6626 blk_flush(ns->blkconf.blk);
6627 }
6628
6629 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6630 }
6631
6632 break;
6633
6634 case NVME_NUMBER_OF_QUEUES:
6635 if (n->qs_created) {
6636 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6637 }
6638
6639 /*
6640 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6641 * and NSQR.
6642 */
6643 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6644 return NVME_INVALID_FIELD | NVME_DNR;
6645 }
6646
6647 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6648 ((dw11 >> 16) & 0xffff) + 1,
6649 n->conf_ioqpairs,
6650 n->conf_ioqpairs);
6651 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6652 ((n->conf_ioqpairs - 1) << 16));
6653 break;
6654 case NVME_ASYNCHRONOUS_EVENT_CONF:
6655 n->features.async_config = dw11;
6656 break;
6657 case NVME_TIMESTAMP:
6658 return nvme_set_feature_timestamp(n, req);
6659 case NVME_HOST_BEHAVIOR_SUPPORT:
6660 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6661 sizeof(n->features.hbs), req);
6662 if (status) {
6663 return status;
6664 }
6665
6666 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6667 ns = nvme_ns(n, i);
6668
6669 if (!ns) {
6670 continue;
6671 }
6672
6673 ns->id_ns.nlbaf = ns->nlbaf - 1;
6674 if (!n->features.hbs.lbafee) {
6675 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6676 }
6677 }
6678
6679 return status;
6680 case NVME_COMMAND_SET_PROFILE:
6681 if (dw11 & 0x1ff) {
6682 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6683 return NVME_IOCS_COMBINATION_REJECTED | NVME_DNR;
6684 }
6685 break;
6686 case NVME_FDP_MODE:
6687 /* spec: abort with Command Sequence Error if one or more namespaces exist in the endurance group */
6688 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6689 case NVME_FDP_EVENTS:
6690 return nvme_set_feature_fdp_events(n, ns, req);
6691 case NVME_WRITE_ATOMICITY:
6692
6693 n->dn = 0x1 & dw11;
6694
6695 if (n->dn) {
6696 atomic->atomic_max_write_size = le16_to_cpu(id->awupf) + 1;
6697 } else {
6698 atomic->atomic_max_write_size = le16_to_cpu(id->awun) + 1;
6699 }
6700
6701 if (atomic->atomic_max_write_size == 1) {
6702 atomic->atomic_writes = 0;
6703 } else {
6704 atomic->atomic_writes = 1;
6705 }
6706 break;
6707 default:
6708 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6709 }
6710 return NVME_SUCCESS;
6711 }
6712
6713 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6714 {
6715 trace_pci_nvme_aer(nvme_cid(req));
6716
6717 if (n->outstanding_aers > n->params.aerl) {
6718 trace_pci_nvme_aer_aerl_exceeded();
6719 return NVME_AER_LIMIT_EXCEEDED;
6720 }
6721
6722 n->aer_reqs[n->outstanding_aers] = req;
6723 n->outstanding_aers++;
6724
6725 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6726 nvme_process_aers(n);
6727 }
6728
6729 return NVME_NO_COMPLETE;
6730 }
6731
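/*
 * Recompute the Dataset Management Range Size Limit (DMRSL, in logical
 * blocks) as the minimum over the given namespace, or over all attached
 * namespaces when ns is NULL.
 */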
6732 static void nvme_update_dsm_limits(NvmeCtrl *n, NvmeNamespace *ns)
6733 {
6734 if (ns) {
6735 n->dmrsl =
6736 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6737
6738 return;
6739 }
6740
6741 for (uint32_t nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6742 ns = nvme_ns(n, nsid);
6743 if (!ns) {
6744 continue;
6745 }
6746
6747 n->dmrsl =
6748 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6749 }
6750 }
6751
6752 static bool nvme_csi_supported(NvmeCtrl *n, uint8_t csi)
6753 {
6754 uint32_t cc;
6755
6756 switch (csi) {
6757 case NVME_CSI_NVM:
6758 return true;
6759
6760 case NVME_CSI_ZONED:
6761 cc = ldl_le_p(&n->bar.cc);
6762
6763 return NVME_CC_CSS(cc) == NVME_CC_CSS_ALL;
6764 }
6765
6766 g_assert_not_reached();
6767 }
6768
6769 static void nvme_detach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6770 {
6771 assert(ns->attached > 0);
6772
6773 n->namespaces[ns->params.nsid] = NULL;
6774 ns->attached--;
6775 }
6776
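/*
 * Namespace Attachment. The host supplies a controller list; depending on
 * the SEL field the namespace is attached to or detached from each listed
 * controller, and a Namespace Attribute Changed event is queued for the
 * affected controllers.
 */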
6777 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6778 {
6779 NvmeNamespace *ns;
6780 NvmeCtrl *ctrl;
6781 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6782 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6783 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6784 uint8_t sel = dw10 & 0xf;
6785 uint16_t *nr_ids = &list[0];
6786 uint16_t *ids = &list[1];
6787 uint16_t ret;
6788 int i;
6789
6790 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6791
6792 if (!nvme_nsid_valid(n, nsid)) {
6793 return NVME_INVALID_NSID | NVME_DNR;
6794 }
6795
6796 ns = nvme_subsys_ns(n->subsys, nsid);
6797 if (!ns) {
6798 return NVME_INVALID_FIELD | NVME_DNR;
6799 }
6800
6801 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6802 if (ret) {
6803 return ret;
6804 }
6805
6806 if (!*nr_ids) {
6807 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6808 }
6809
6810 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6811 for (i = 0; i < *nr_ids; i++) {
6812 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6813 if (!ctrl) {
6814 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6815 }
6816
6817 switch (sel) {
6818 case NVME_NS_ATTACHMENT_ATTACH:
6819 if (nvme_ns(ctrl, nsid)) {
6820 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6821 }
6822
6823 if (ns->attached && !ns->params.shared) {
6824 return NVME_NS_PRIVATE | NVME_DNR;
6825 }
6826
6827 if (!nvme_csi_supported(ctrl, ns->csi)) {
6828 return NVME_IOCS_NOT_SUPPORTED | NVME_DNR;
6829 }
6830
6831 nvme_attach_ns(ctrl, ns);
6832 nvme_update_dsm_limits(ctrl, ns);
6833
6834 break;
6835
6836 case NVME_NS_ATTACHMENT_DETACH:
6837 if (!nvme_ns(ctrl, nsid)) {
6838 return NVME_NS_NOT_ATTACHED | NVME_DNR;
6839 }
6840
6841 nvme_detach_ns(ctrl, ns);
6842 nvme_update_dsm_limits(ctrl, NULL);
6843
6844 break;
6845
6846 default:
6847 return NVME_INVALID_FIELD | NVME_DNR;
6848 }
6849
6850 /*
6851 * Add namespace id to the changed namespace id list for event clearing
6852 * via Get Log Page command.
6853 */
6854 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6855 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6856 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6857 NVME_LOG_CHANGED_NSLIST);
6858 }
6859 }
6860
6861 return NVME_SUCCESS;
6862 }
6863
6864 typedef struct NvmeFormatAIOCB {
6865 BlockAIOCB common;
6866 BlockAIOCB *aiocb;
6867 NvmeRequest *req;
6868 int ret;
6869
6870 NvmeNamespace *ns;
6871 uint32_t nsid;
6872 bool broadcast;
6873 int64_t offset;
6874
6875 uint8_t lbaf;
6876 uint8_t mset;
6877 uint8_t pi;
6878 uint8_t pil;
6879 } NvmeFormatAIOCB;
6880
6881 static void nvme_format_cancel(BlockAIOCB *aiocb)
6882 {
6883 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6884
6885 iocb->ret = -ECANCELED;
6886
6887 if (iocb->aiocb) {
6888 blk_aio_cancel_async(iocb->aiocb);
6889 iocb->aiocb = NULL;
6890 }
6891 }
6892
6893 static const AIOCBInfo nvme_format_aiocb_info = {
6894 .aiocb_size = sizeof(NvmeFormatAIOCB),
6895 .cancel_async = nvme_format_cancel,
6896 };
6897
6898 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6899 uint8_t pi, uint8_t pil)
6900 {
6901 uint8_t lbafl = lbaf & 0xf;
6902 uint8_t lbafu = lbaf >> 4;
6903
6904 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6905
6906 ns->id_ns.dps = (pil << 3) | pi;
6907 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6908
6909 nvme_ns_init_format(ns);
6910 }
6911
6912 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6913
6914 static void nvme_format_ns_cb(void *opaque, int ret)
6915 {
6916 NvmeFormatAIOCB *iocb = opaque;
6917 NvmeNamespace *ns = iocb->ns;
6918 int bytes;
6919
6920 if (iocb->ret < 0) {
6921 goto done;
6922 } else if (ret < 0) {
6923 iocb->ret = ret;
6924 goto done;
6925 }
6926
6927 assert(ns);
6928
6929 if (iocb->offset < ns->size) {
6930 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6931
6932 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6933 bytes, BDRV_REQ_MAY_UNMAP,
6934 nvme_format_ns_cb, iocb);
6935
6936 iocb->offset += bytes;
6937 return;
6938 }
6939
6940 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6941 ns->status = 0x0;
6942 iocb->ns = NULL;
6943 iocb->offset = 0;
6944
6945 done:
6946 nvme_do_format(iocb);
6947 }
6948
6949 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6950 {
6951 if (ns->params.zoned) {
6952 return NVME_INVALID_FORMAT | NVME_DNR;
6953 }
6954
6955 if (lbaf > ns->id_ns.nlbaf) {
6956 return NVME_INVALID_FORMAT | NVME_DNR;
6957 }
6958
6959 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6960 return NVME_INVALID_FORMAT | NVME_DNR;
6961 }
6962
6963 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6964 return NVME_INVALID_FIELD | NVME_DNR;
6965 }
6966
6967 return NVME_SUCCESS;
6968 }
6969
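/*
 * Format one namespace, or iterate over all attached namespaces when the
 * broadcast NSID was given. Each namespace is zeroed out on the backing
 * block device before the new LBA format and protection settings apply.
 */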
6970 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6971 {
6972 NvmeRequest *req = iocb->req;
6973 NvmeCtrl *n = nvme_ctrl(req);
6974 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6975 uint8_t lbaf = dw10 & 0xf;
6976 uint8_t pi = (dw10 >> 5) & 0x7;
6977 uint16_t status;
6978 int i;
6979
6980 if (iocb->ret < 0) {
6981 goto done;
6982 }
6983
6984 if (iocb->broadcast) {
6985 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6986 iocb->ns = nvme_ns(n, i);
6987 if (iocb->ns) {
6988 iocb->nsid = i;
6989 break;
6990 }
6991 }
6992 }
6993
6994 if (!iocb->ns) {
6995 goto done;
6996 }
6997
6998 status = nvme_format_check(iocb->ns, lbaf, pi);
6999 if (status) {
7000 req->status = status;
7001 goto done;
7002 }
7003
7004 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
7005 nvme_format_ns_cb(iocb, 0);
7006 return;
7007
7008 done:
7009 iocb->common.cb(iocb->common.opaque, iocb->ret);
7010 qemu_aio_unref(iocb);
7011 }
7012
7013 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
7014 {
7015 NvmeFormatAIOCB *iocb;
7016 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7017 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7018 uint8_t lbaf = dw10 & 0xf;
7019 uint8_t mset = (dw10 >> 4) & 0x1;
7020 uint8_t pi = (dw10 >> 5) & 0x7;
7021 uint8_t pil = (dw10 >> 8) & 0x1;
7022 uint8_t lbafu = (dw10 >> 12) & 0x3;
7023 uint16_t status;
7024
7025 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
7026
7027 iocb->req = req;
7028 iocb->ret = 0;
7029 iocb->ns = NULL;
7030 iocb->nsid = 0;
7031 iocb->lbaf = lbaf;
7032 iocb->mset = mset;
7033 iocb->pi = pi;
7034 iocb->pil = pil;
7035 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
7036 iocb->offset = 0;
7037
7038 if (n->features.hbs.lbafee) {
7039 iocb->lbaf |= lbafu << 4;
7040 }
7041
7042 if (!iocb->broadcast) {
7043 if (!nvme_nsid_valid(n, nsid)) {
7044 status = NVME_INVALID_NSID | NVME_DNR;
7045 goto out;
7046 }
7047
7048 iocb->ns = nvme_ns(n, nsid);
7049 if (!iocb->ns) {
7050 status = NVME_INVALID_FIELD | NVME_DNR;
7051 goto out;
7052 }
7053 }
7054
7055 req->aiocb = &iocb->common;
7056 nvme_do_format(iocb);
7057
7058 return NVME_NO_COMPLETE;
7059
7060 out:
7061 qemu_aio_unref(iocb);
7062
7063 return status;
7064 }
7065
7066 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
7067 int *num_prim, int *num_sec)
7068 {
7069 *num_total = le32_to_cpu(rt ?
7070 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
7071 *num_prim = le16_to_cpu(rt ?
7072 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
7073 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
7074 }
7075
7076 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
7077 uint16_t cntlid, uint8_t rt,
7078 int nr)
7079 {
7080 int num_total, num_prim, num_sec;
7081
7082 if (cntlid != n->cntlid) {
7083 return NVME_INVALID_CTRL_ID | NVME_DNR;
7084 }
7085
7086 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7087
7088 if (nr > num_total) {
7089 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7090 }
7091
7092 if (nr > num_total - num_sec) {
7093 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7094 }
7095
7096 if (rt) {
7097 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
7098 } else {
7099 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
7100 }
7101
7102 req->cqe.result = cpu_to_le32(nr);
7103 return req->status;
7104 }
7105
7106 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
7107 uint8_t rt, int nr)
7108 {
7109 int prev_nr, prev_total;
7110
7111 if (rt) {
7112 prev_nr = le16_to_cpu(sctrl->nvi);
7113 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
7114 sctrl->nvi = cpu_to_le16(nr);
7115 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
7116 } else {
7117 prev_nr = le16_to_cpu(sctrl->nvq);
7118 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
7119 sctrl->nvq = cpu_to_le16(nr);
7120 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
7121 }
7122 }
7123
7124 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
7125 uint16_t cntlid, uint8_t rt, int nr)
7126 {
7127 int num_total, num_prim, num_sec, num_free, diff, limit;
7128 NvmeSecCtrlEntry *sctrl;
7129
7130 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7131 if (!sctrl) {
7132 return NVME_INVALID_CTRL_ID | NVME_DNR;
7133 }
7134
7135 if (sctrl->scs) {
7136 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7137 }
7138
7139 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
7140 if (nr > limit) {
7141 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7142 }
7143
7144 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7145 num_free = num_total - num_prim - num_sec;
7146 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
7147
7148 if (diff > num_free) {
7149 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7150 }
7151
7152 nvme_update_virt_res(n, sctrl, rt, nr);
7153 req->cqe.result = cpu_to_le32(nr);
7154
7155 return req->status;
7156 }
7157
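/*
 * Online or offline a secondary (VF) controller. Onlining requires that VQ
 * and VI flexible resources have been assigned; offlining returns those
 * resources to the free pool and resets the VF controller.
 */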
7158 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
7159 {
7160 PCIDevice *pci = PCI_DEVICE(n);
7161 NvmeCtrl *sn = NULL;
7162 NvmeSecCtrlEntry *sctrl;
7163 int vf_index;
7164
7165 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7166 if (!sctrl) {
7167 return NVME_INVALID_CTRL_ID | NVME_DNR;
7168 }
7169
7170 if (!pci_is_vf(pci)) {
7171 vf_index = le16_to_cpu(sctrl->vfn) - 1;
7172 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
7173 }
7174
7175 if (online) {
7176 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
7177 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7178 }
7179
7180 if (!sctrl->scs) {
7181 sctrl->scs = 0x1;
7182 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7183 }
7184 } else {
7185 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7186 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7187
7188 if (sctrl->scs) {
7189 sctrl->scs = 0x0;
7190 if (sn) {
7191 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7192 }
7193 }
7194 }
7195
7196 return NVME_SUCCESS;
7197 }
7198
7199 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7200 {
7201 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7202 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7203 uint8_t act = dw10 & 0xf;
7204 uint8_t rt = (dw10 >> 8) & 0x7;
7205 uint16_t cntlid = (dw10 >> 16) & 0xffff;
7206 int nr = dw11 & 0xffff;
7207
7208 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7209
7210 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7211 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7212 }
7213
7214 switch (act) {
7215 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7216 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7217 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7218 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7219 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7220 return nvme_virt_set_state(n, cntlid, true);
7221 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7222 return nvme_virt_set_state(n, cntlid, false);
7223 default:
7224 return NVME_INVALID_FIELD | NVME_DNR;
7225 }
7226 }
7227
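/*
 * Doorbell Buffer Config. With CAP.DSTRD == 0 the shadow doorbell for
 * submission queue i lives at dbs_addr + (i << 3) and the corresponding
 * completion queue entry at dbs_addr + (i << 3) + (1 << 2); e.g. queue
 * pair 3 uses offsets 0x18 and 0x1c.
 */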
7228 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7229 {
7230 PCIDevice *pci = PCI_DEVICE(n);
7231 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7232 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7233 int i;
7234
7235 /* Address should be page aligned */
7236 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7237 return NVME_INVALID_FIELD | NVME_DNR;
7238 }
7239
7240 /* Save shadow buffer base addr for use during queue creation */
7241 n->dbbuf_dbs = dbs_addr;
7242 n->dbbuf_eis = eis_addr;
7243 n->dbbuf_enabled = true;
7244
7245 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7246 NvmeSQueue *sq = n->sq[i];
7247 NvmeCQueue *cq = n->cq[i];
7248
7249 if (sq) {
7250 /*
7251 * CAP.DSTRD is 0, so the offset of the ith SQ db_addr is (i << 3).
7252 * nvme_process_db() uses this hard-coded way to calculate
7253 * doorbell offsets. Be consistent with that here.
7254 */
7255 sq->db_addr = dbs_addr + (i << 3);
7256 sq->ei_addr = eis_addr + (i << 3);
7257 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7258
7259 if (n->params.ioeventfd && sq->sqid != 0) {
7260 if (!nvme_init_sq_ioeventfd(sq)) {
7261 sq->ioeventfd_enabled = true;
7262 }
7263 }
7264 }
7265
7266 if (cq) {
7267 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7268 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7269 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7270 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7271
7272 if (n->params.ioeventfd && cq->cqid != 0) {
7273 if (!nvme_init_cq_ioeventfd(cq)) {
7274 cq->ioeventfd_enabled = true;
7275 }
7276 }
7277 }
7278 }
7279
7280 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7281
7282 return NVME_SUCCESS;
7283 }
7284
7285 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7286 {
7287 return NVME_INVALID_FIELD | NVME_DNR;
7288 }
7289
7290 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7291 {
7292 NvmeNamespace *ns;
7293 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7294 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7295 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7296 uint8_t doper, dtype;
7297 uint32_t numd, trans_len;
7298 NvmeDirectiveIdentify id = {
7299 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7300 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7301 };
7302
7303 numd = dw10 + 1;
7304 doper = dw11 & 0xff;
7305 dtype = (dw11 >> 8) & 0xff;
7306
7307 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7308
7309 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7310 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7311 return NVME_INVALID_FIELD | NVME_DNR;
7312 }
7313
7314 ns = nvme_ns(n, nsid);
7315 if (!ns) {
7316 return NVME_INVALID_FIELD | NVME_DNR;
7317 }
7318
7319 switch (dtype) {
7320 case NVME_DIRECTIVE_IDENTIFY:
7321 switch (doper) {
7322 case NVME_DIRECTIVE_RETURN_PARAMS:
7323 if (ns->endgrp && ns->endgrp->fdp.enabled) {
7324 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7325 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7326 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7327 }
7328
7329 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7330
7331 default:
7332 return NVME_INVALID_FIELD | NVME_DNR;
7333 }
7334
7335 default:
7336 return NVME_INVALID_FIELD;
7337 }
7338 }
7339
7340 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7341 {
7342 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7343 nvme_adm_opc_str(req->cmd.opcode));
7344
7345 if (!(n->cse.acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7346 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7347 return NVME_INVALID_OPCODE | NVME_DNR;
7348 }
7349
7350 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7351 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7352 return NVME_INVALID_FIELD | NVME_DNR;
7353 }
7354
7355 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7356 return NVME_INVALID_FIELD;
7357 }
7358
7359 switch (req->cmd.opcode) {
7360 case NVME_ADM_CMD_DELETE_SQ:
7361 return nvme_del_sq(n, req);
7362 case NVME_ADM_CMD_CREATE_SQ:
7363 return nvme_create_sq(n, req);
7364 case NVME_ADM_CMD_GET_LOG_PAGE:
7365 return nvme_get_log(n, req);
7366 case NVME_ADM_CMD_DELETE_CQ:
7367 return nvme_del_cq(n, req);
7368 case NVME_ADM_CMD_CREATE_CQ:
7369 return nvme_create_cq(n, req);
7370 case NVME_ADM_CMD_IDENTIFY:
7371 return nvme_identify(n, req);
7372 case NVME_ADM_CMD_ABORT:
7373 return nvme_abort(n, req);
7374 case NVME_ADM_CMD_SET_FEATURES:
7375 return nvme_set_feature(n, req);
7376 case NVME_ADM_CMD_GET_FEATURES:
7377 return nvme_get_feature(n, req);
7378 case NVME_ADM_CMD_ASYNC_EV_REQ:
7379 return nvme_aer(n, req);
7380 case NVME_ADM_CMD_NS_ATTACHMENT:
7381 return nvme_ns_attachment(n, req);
7382 case NVME_ADM_CMD_VIRT_MNGMT:
7383 return nvme_virt_mngmt(n, req);
7384 case NVME_ADM_CMD_DBBUF_CONFIG:
7385 return nvme_dbbuf_config(n, req);
7386 case NVME_ADM_CMD_FORMAT_NVM:
7387 return nvme_format(n, req);
7388 case NVME_ADM_CMD_DIRECTIVE_SEND:
7389 return nvme_directive_send(n, req);
7390 case NVME_ADM_CMD_DIRECTIVE_RECV:
7391 return nvme_directive_receive(n, req);
7392 default:
7393 g_assert_not_reached();
7394 }
7395
7396 return NVME_INVALID_OPCODE | NVME_DNR;
7397 }
7398
7399 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7400 {
7401 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7402
7403 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7404 MEMTXATTRS_UNSPECIFIED);
7405 }
7406
7407 static void nvme_update_sq_tail(NvmeSQueue *sq)
7408 {
7409 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7410 MEMTXATTRS_UNSPECIFIED);
7411
7412 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7413 }
7414
7415 #define NVME_ATOMIC_NO_START 0
7416 #define NVME_ATOMIC_START_ATOMIC 1
7417 #define NVME_ATOMIC_START_NONATOMIC 2
7418
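/*
 * Decide whether a command may start given the currently outstanding I/O:
 * NVME_ATOMIC_NO_START means it overlaps a conflicting request and must be
 * retried later, NVME_ATOMIC_START_ATOMIC means it executes as an atomic
 * write, and NVME_ATOMIC_START_NONATOMIC means it executes normally.
 */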
7419 static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd,
7420 NvmeAtomic *atomic)
7421 {
7422 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
7423 uint64_t slba = le64_to_cpu(rw->slba);
7424 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
7425 uint64_t elba = slba + nlb;
7426 bool cmd_atomic_wr = true;
7427 int i;
7428
7429 if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
7430 ((rw->nlb + 1) > atomic->atomic_max_write_size))) {
7431 cmd_atomic_wr = false;
7432 }
7433
7434 /*
7435 * Walk the queues to see if there are any atomic conflicts.
7436 */
7437 for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
7438 NvmeSQueue *sq;
7439 NvmeRequest *req;
7440 NvmeRwCmd *req_rw;
7441 uint64_t req_slba;
7442 uint32_t req_nlb;
7443 uint64_t req_elba;
7444
7445 sq = n->sq[i];
7446 if (!sq) {
7447 continue;
7448 }
7449
7450 /*
7451 * Walk all the requests on a given queue.
7452 */
7453 QTAILQ_FOREACH(req, &sq->out_req_list, entry) {
7454 req_rw = (NvmeRwCmd *)&req->cmd;
7455
7456 if (((req_rw->opcode == NVME_CMD_WRITE) ||
7457 (req_rw->opcode == NVME_CMD_READ)) &&
7458 (cmd->nsid == req->ns->params.nsid)) {
7459 req_slba = le64_to_cpu(req_rw->slba);
7460 req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
7461 req_elba = req_slba + req_nlb;
7462
7463 if (cmd_atomic_wr) {
7464 if ((elba >= req_slba) && (slba <= req_elba)) {
7465 return NVME_ATOMIC_NO_START;
7466 }
7467 } else {
7468 if (req->atomic_write && ((elba >= req_slba) &&
7469 (slba <= req_elba))) {
7470 return NVME_ATOMIC_NO_START;
7471 }
7472 }
7473 }
7474 }
7475 }
7476 if (cmd_atomic_wr) {
7477 return NVME_ATOMIC_START_ATOMIC;
7478 }
7479 return NVME_ATOMIC_START_NONATOMIC;
7480 }
7481
7482 static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd)
7483 {
7484 if (n->atomic.atomic_writes) {
7485 return &n->atomic;
7486 }
7487 return NULL;
7488 }
7489
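/*
 * Fetch commands from the submission queue, gate them through the atomic
 * write checks where applicable and dispatch them as admin (SQ 0) or I/O
 * commands, posting completions for anything that finishes synchronously.
 */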
7490 static void nvme_process_sq(void *opaque)
7491 {
7492 NvmeSQueue *sq = opaque;
7493 NvmeCtrl *n = sq->ctrl;
7494 NvmeCQueue *cq = n->cq[sq->cqid];
7495
7496 uint16_t status;
7497 hwaddr addr;
7498 NvmeCmd cmd;
7499 NvmeRequest *req;
7500
7501 if (n->dbbuf_enabled) {
7502 nvme_update_sq_tail(sq);
7503 }
7504
7505 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7506 NvmeAtomic *atomic;
7507 bool cmd_is_atomic;
7508
7509 addr = sq->dma_addr + (sq->head << NVME_SQES);
7510 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7511 trace_pci_nvme_err_addr_read(addr);
7512 trace_pci_nvme_err_cfs();
7513 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7514 break;
7515 }
7516
7517 atomic = nvme_get_atomic(n, &cmd);
7518
7519 cmd_is_atomic = false;
7520 if (sq->sqid && atomic) {
7521 int ret;
7522
7523 ret = nvme_atomic_write_check(n, &cmd, atomic);
7524 switch (ret) {
7525 case NVME_ATOMIC_NO_START:
7526 qemu_bh_schedule(sq->bh);
7527 return;
7528 case NVME_ATOMIC_START_ATOMIC:
7529 cmd_is_atomic = true;
7530 break;
7531 case NVME_ATOMIC_START_NONATOMIC:
7532 default:
7533 break;
7534 }
7535 }
7536 nvme_inc_sq_head(sq);
7537
7538 req = QTAILQ_FIRST(&sq->req_list);
7539 QTAILQ_REMOVE(&sq->req_list, req, entry);
7540 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7541 nvme_req_clear(req);
7542 req->cqe.cid = cmd.cid;
7543 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7544
7545 if (sq->sqid && atomic) {
7546 req->atomic_write = cmd_is_atomic;
7547 }
7548
7549 status = sq->sqid ? nvme_io_cmd(n, req) :
7550 nvme_admin_cmd(n, req);
7551 if (status != NVME_NO_COMPLETE) {
7552 req->status = status;
7553 nvme_enqueue_req_completion(cq, req);
7554 }
7555
7556 if (n->dbbuf_enabled) {
7557 nvme_update_sq_eventidx(sq);
7558 nvme_update_sq_tail(sq);
7559 }
7560 }
7561 }
7562
7563 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7564 {
7565 uint8_t *config;
7566
7567 if (!msix_present(pci_dev)) {
7568 return;
7569 }
7570
7571 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7572
7573 config = pci_dev->config + pci_dev->msix_cap;
7574 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7575 table_size - 1);
7576 }
7577
7578 static void nvme_activate_virt_res(NvmeCtrl *n)
7579 {
7580 PCIDevice *pci_dev = PCI_DEVICE(n);
7581 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7582 NvmeSecCtrlEntry *sctrl;
7583
7584 /* -1 to account for the admin queue */
7585 if (pci_is_vf(pci_dev)) {
7586 sctrl = nvme_sctrl(n);
7587 cap->vqprt = sctrl->nvq;
7588 cap->viprt = sctrl->nvi;
7589 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7590 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7591 } else {
7592 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7593 cap->virfap = n->next_pri_ctrl_cap.virfap;
7594 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7595 le16_to_cpu(cap->vqrfap) - 1;
7596 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7597 le16_to_cpu(cap->virfap);
7598 }
7599 }
7600
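/*
 * Controller reset: drain all namespaces, free every queue pair and drop
 * any pending asynchronous events. For an SR-IOV PF all secondary
 * controllers are offlined; unless this is a CC.EN initiated reset the
 * flexible resources assigned via Virtualization Management are activated.
 */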
7601 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7602 {
7603 PCIDevice *pci_dev = PCI_DEVICE(n);
7604 NvmeSecCtrlEntry *sctrl;
7605 NvmeNamespace *ns;
7606 int i;
7607
7608 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7609 ns = nvme_ns(n, i);
7610 if (!ns) {
7611 continue;
7612 }
7613
7614 nvme_ns_drain(ns);
7615 }
7616
7617 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7618 if (n->sq[i] != NULL) {
7619 nvme_free_sq(n->sq[i], n);
7620 }
7621 }
7622 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7623 if (n->cq[i] != NULL) {
7624 nvme_free_cq(n->cq[i], n);
7625 }
7626 }
7627
7628 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7629 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7630 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7631 g_free(event);
7632 }
7633
7634 if (n->params.sriov_max_vfs) {
7635 if (!pci_is_vf(pci_dev)) {
7636 for (i = 0; i < n->nr_sec_ctrls; i++) {
7637 sctrl = &n->sec_ctrl_list[i];
7638 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7639 }
7640 }
7641
7642 if (rst != NVME_RESET_CONTROLLER) {
7643 nvme_activate_virt_res(n);
7644 }
7645 }
7646
7647 n->aer_queued = 0;
7648 n->aer_mask = 0;
7649 n->outstanding_aers = 0;
7650 n->qs_created = false;
7651
7652 n->dn = n->params.atomic_dn; /* Set Disable Normal */
7653
7654 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7655
7656 if (pci_is_vf(pci_dev)) {
7657 sctrl = nvme_sctrl(n);
7658
7659 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7660 } else {
7661 stl_le_p(&n->bar.csts, 0);
7662 }
7663
7664 stl_le_p(&n->bar.intms, 0);
7665 stl_le_p(&n->bar.intmc, 0);
7666 stl_le_p(&n->bar.cc, 0);
7667
7668 n->dbbuf_dbs = 0;
7669 n->dbbuf_eis = 0;
7670 n->dbbuf_enabled = false;
7671 }
7672
7673 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7674 {
7675 NvmeNamespace *ns;
7676 int i;
7677
7678 if (n->pmr.dev) {
7679 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7680 }
7681
7682 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7683 ns = nvme_ns(n, i);
7684 if (!ns) {
7685 continue;
7686 }
7687
7688 nvme_ns_shutdown(ns);
7689 }
7690 }
7691
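/*
 * Controller enable (CC.EN 0 -> 1). Validates the admin queue and memory
 * page size configuration, initializes the admin queue pair and attaches
 * the namespaces whose command set is supported by the selected CC.CSS.
 */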
7692 static int nvme_start_ctrl(NvmeCtrl *n)
7693 {
7694 uint64_t cap = ldq_le_p(&n->bar.cap);
7695 uint32_t cc = ldl_le_p(&n->bar.cc);
7696 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7697 uint64_t asq = ldq_le_p(&n->bar.asq);
7698 uint64_t acq = ldq_le_p(&n->bar.acq);
7699 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7700 uint32_t page_size = 1 << page_bits;
7701 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7702
7703 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7704 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7705 le16_to_cpu(sctrl->nvq));
7706 return -1;
7707 }
7708 if (unlikely(n->cq[0])) {
7709 trace_pci_nvme_err_startfail_cq();
7710 return -1;
7711 }
7712 if (unlikely(n->sq[0])) {
7713 trace_pci_nvme_err_startfail_sq();
7714 return -1;
7715 }
7716 if (unlikely(asq & (page_size - 1))) {
7717 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7718 return -1;
7719 }
7720 if (unlikely(acq & (page_size - 1))) {
7721 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7722 return -1;
7723 }
7724 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7725 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7726 return -1;
7727 }
7728 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7729 trace_pci_nvme_err_startfail_page_too_small(
7730 NVME_CC_MPS(cc),
7731 NVME_CAP_MPSMIN(cap));
7732 return -1;
7733 }
7734 if (unlikely(NVME_CC_MPS(cc) >
7735 NVME_CAP_MPSMAX(cap))) {
7736 trace_pci_nvme_err_startfail_page_too_large(
7737 NVME_CC_MPS(cc),
7738 NVME_CAP_MPSMAX(cap));
7739 return -1;
7740 }
7741 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7742 trace_pci_nvme_err_startfail_asqent_sz_zero();
7743 return -1;
7744 }
7745 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7746 trace_pci_nvme_err_startfail_acqent_sz_zero();
7747 return -1;
7748 }
7749
7750 n->page_bits = page_bits;
7751 n->page_size = page_size;
7752 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7753 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7754 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7755
7756 nvme_set_timestamp(n, 0ULL);
7757
7758 /* verify that the command sets of attached namespaces are supported */
7759 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7760 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
7761
7762 if (!ns || (!ns->params.shared && ns->ctrl != n)) {
7763 continue;
7764 }
7765
7766 if (nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
7767 if (!ns->attached || ns->params.shared) {
7768 nvme_attach_ns(n, ns);
7769 }
7770 }
7771 }
7772
7773 nvme_update_dsm_limits(n, NULL);
7774
7775 return 0;
7776 }
7777
7778 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7779 {
7780 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7781 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7782
7783 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7784 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7785 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7786 stl_le_p(&n->bar.cmbloc, cmbloc);
7787
7788 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7789 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7790 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7791 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7792 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7793 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7794 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7795 stl_le_p(&n->bar.cmbsz, cmbsz);
7796 }
7797
7798 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7799 unsigned size)
7800 {
7801 PCIDevice *pci = PCI_DEVICE(n);
7802 uint64_t cap = ldq_le_p(&n->bar.cap);
7803 uint32_t cc = ldl_le_p(&n->bar.cc);
7804 uint32_t intms = ldl_le_p(&n->bar.intms);
7805 uint32_t csts = ldl_le_p(&n->bar.csts);
7806 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7807
7808 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7809 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7810 "MMIO write not 32-bit aligned,"
7811 " offset=0x%"PRIx64"", offset);
7812 /* should be ignored, fall through for now */
7813 }
7814
7815 if (unlikely(size < sizeof(uint32_t))) {
7816 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7817 "MMIO write smaller than 32-bits,"
7818 " offset=0x%"PRIx64", size=%u",
7819 offset, size);
7820 /* should be ignored, fall through for now */
7821 }
7822
7823 switch (offset) {
7824 case NVME_REG_INTMS:
7825 if (unlikely(msix_enabled(pci))) {
7826 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7827 "undefined access to interrupt mask set"
7828 " when MSI-X is enabled");
7829 /* should be ignored, fall through for now */
7830 }
7831 intms |= data;
7832 stl_le_p(&n->bar.intms, intms);
7833 n->bar.intmc = n->bar.intms;
7834 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7835 nvme_irq_check(n);
7836 break;
7837 case NVME_REG_INTMC:
7838 if (unlikely(msix_enabled(pci))) {
7839 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7840 "undefined access to interrupt mask clr"
7841 " when MSI-X is enabled");
7842 /* should be ignored, fall through for now */
7843 }
7844 intms &= ~data;
7845 stl_le_p(&n->bar.intms, intms);
7846 n->bar.intmc = n->bar.intms;
7847 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7848 nvme_irq_check(n);
7849 break;
7850 case NVME_REG_CC:
7851 stl_le_p(&n->bar.cc, data);
7852
7853 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7854
7855 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7856 trace_pci_nvme_mmio_shutdown_set();
7857 nvme_ctrl_shutdown(n);
7858 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7859 csts |= NVME_CSTS_SHST_COMPLETE;
7860 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7861 trace_pci_nvme_mmio_shutdown_cleared();
7862 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7863 }
7864
7865 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7866 if (unlikely(nvme_start_ctrl(n))) {
7867 trace_pci_nvme_err_startfail();
7868 csts = NVME_CSTS_FAILED;
7869 } else {
7870 trace_pci_nvme_mmio_start_success();
7871 csts = NVME_CSTS_READY;
7872 }
7873 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7874 trace_pci_nvme_mmio_stopped();
7875 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7876
7877 break;
7878 }
7879
7880 stl_le_p(&n->bar.csts, csts);
7881
7882 break;
7883 case NVME_REG_CSTS:
7884 if (data & (1 << 4)) {
7885 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7886 "attempted to W1C CSTS.NSSRO"
7887 " but CAP.NSSRS is zero (not supported)");
7888 } else if (data != 0) {
7889 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7890 "attempted to set a read only bit"
7891 " of controller status");
7892 }
7893 break;
7894 case NVME_REG_NSSR:
7895 if (data == 0x4e564d65) {
7896 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7897 } else {
7898 /* The spec says that writes of other values have no effect */
7899 return;
7900 }
7901 break;
7902 case NVME_REG_AQA:
7903 stl_le_p(&n->bar.aqa, data);
7904 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7905 break;
7906 case NVME_REG_ASQ:
7907 stn_le_p(&n->bar.asq, size, data);
7908 trace_pci_nvme_mmio_asqaddr(data);
7909 break;
7910 case NVME_REG_ASQ + 4:
7911 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7912 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7913 break;
7914 case NVME_REG_ACQ:
7915 trace_pci_nvme_mmio_acqaddr(data);
7916 stn_le_p(&n->bar.acq, size, data);
7917 break;
7918 case NVME_REG_ACQ + 4:
7919 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7920 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7921 break;
7922 case NVME_REG_CMBLOC:
7923 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7924 "invalid write to reserved CMBLOC"
7925 " when CMBSZ is zero, ignored");
7926 return;
7927 case NVME_REG_CMBSZ:
7928 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7929 "invalid write to read only CMBSZ, ignored");
7930 return;
7931 case NVME_REG_CMBMSC:
7932 if (!NVME_CAP_CMBS(cap)) {
7933 return;
7934 }
7935
7936 stn_le_p(&n->bar.cmbmsc, size, data);
7937 n->cmb.cmse = false;
7938
7939 if (NVME_CMBMSC_CRE(data)) {
7940 nvme_cmb_enable_regs(n);
7941
7942 if (NVME_CMBMSC_CMSE(data)) {
7943 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7944 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7945 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7946 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7947 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7948 stl_le_p(&n->bar.cmbsts, cmbsts);
7949 return;
7950 }
7951
7952 n->cmb.cba = cba;
7953 n->cmb.cmse = true;
7954 }
7955 } else {
7956 n->bar.cmbsz = 0;
7957 n->bar.cmbloc = 0;
7958 }
7959
7960 return;
7961 case NVME_REG_CMBMSC + 4:
7962 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7963 return;
7964
7965 case NVME_REG_PMRCAP:
7966 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7967 "invalid write to PMRCAP register, ignored");
7968 return;
7969 case NVME_REG_PMRCTL:
7970 if (!NVME_CAP_PMRS(cap)) {
7971 return;
7972 }
7973
7974 stl_le_p(&n->bar.pmrctl, data);
7975 if (NVME_PMRCTL_EN(data)) {
7976 memory_region_set_enabled(&n->pmr.dev->mr, true);
7977 pmrsts = 0;
7978 } else {
7979 memory_region_set_enabled(&n->pmr.dev->mr, false);
7980 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7981 n->pmr.cmse = false;
7982 }
7983 stl_le_p(&n->bar.pmrsts, pmrsts);
7984 return;
7985 case NVME_REG_PMRSTS:
7986 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7987 "invalid write to PMRSTS register, ignored");
7988 return;
7989 case NVME_REG_PMREBS:
7990 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7991 "invalid write to PMREBS register, ignored");
7992 return;
7993 case NVME_REG_PMRSWTP:
7994 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7995 "invalid write to PMRSWTP register, ignored");
7996 return;
7997 case NVME_REG_PMRMSCL:
7998 if (!NVME_CAP_PMRS(cap)) {
7999 return;
8000 }
8001
8002 stl_le_p(&n->bar.pmrmscl, data);
8003 n->pmr.cmse = false;
8004
8005 if (NVME_PMRMSCL_CMSE(data)) {
8006 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
8007 hwaddr cba = pmrmscu << 32 |
8008 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
8009 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
8010 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
8011 stl_le_p(&n->bar.pmrsts, pmrsts);
8012 return;
8013 }
8014
8015 n->pmr.cmse = true;
8016 n->pmr.cba = cba;
8017 }
8018
8019 return;
8020 case NVME_REG_PMRMSCU:
8021 if (!NVME_CAP_PMRS(cap)) {
8022 return;
8023 }
8024
8025 stl_le_p(&n->bar.pmrmscu, data);
8026 return;
8027 default:
8028 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
8029 "invalid MMIO write,"
8030 " offset=0x%"PRIx64", data=%"PRIx64"",
8031 offset, data);
8032 break;
8033 }
8034 }
8035
8036 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
8037 {
8038 NvmeCtrl *n = (NvmeCtrl *)opaque;
8039 uint8_t *ptr = (uint8_t *)&n->bar;
8040
8041 trace_pci_nvme_mmio_read(addr, size);
8042
8043 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
8044 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
8045 "MMIO read not 32-bit aligned,"
8046 " offset=0x%"PRIx64"", addr);
8047 /* should RAZ, fall through for now */
8048 } else if (unlikely(size < sizeof(uint32_t))) {
8049 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
8050 "MMIO read smaller than 32-bits,"
8051 " offset=0x%"PRIx64"", addr);
8052 /* should RAZ, fall through for now */
8053 }
8054
8055 if (addr > sizeof(n->bar) - size) {
8056 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
8057 "MMIO read beyond last register,"
8058 " offset=0x%"PRIx64", returning 0", addr);
8059
8060 return 0;
8061 }
8062
8063 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8064 addr != NVME_REG_CSTS) {
8065 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8066 return 0;
8067 }
8068
8069     /*
8070      * When PMRWBM bit 1 is set, a read from
8071      * PMRSTS should ensure that prior writes
8072      * made it to persistent media.
8073      */
8074 if (addr == NVME_REG_PMRSTS &&
8075 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
8076 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
8077 }
8078
8079 return ldn_le_p(ptr + addr, size);
8080 }
8081
8082 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
8083 {
8084 PCIDevice *pci = PCI_DEVICE(n);
8085 uint32_t qid;
8086
8087 if (unlikely(addr & ((1 << 2) - 1))) {
8088 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
8089 "doorbell write not 32-bit aligned,"
8090 " offset=0x%"PRIx64", ignoring", addr);
8091 return;
8092 }
8093
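    /*
     * Illustrative note, assuming the 4-byte doorbell stride used by this
     * device (CAP.DSTRD = 0): doorbells start at offset 1000h and come in
     * submission/completion pairs, so the admin SQ tail doorbell sits at
     * 1000h, the admin CQ head doorbell at 1004h, I/O queue 1 at 1008h and
     * 100ch, and so on. The decode below recovers the queue id and the
     * SQ/CQ distinction from that layout.
     */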
8094 if (((addr - 0x1000) >> 2) & 1) {
8095 /* Completion queue doorbell write */
8096
8097 uint16_t new_head = val & 0xffff;
8098 NvmeCQueue *cq;
8099
8100 qid = (addr - (0x1000 + (1 << 2))) >> 3;
8101 if (unlikely(nvme_check_cqid(n, qid))) {
8102 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
8103 "completion queue doorbell write"
8104 " for nonexistent queue,"
8105                            " cqid=%"PRIu32", ignoring", qid);
8106
8107 /*
8108          * NVM Express v1.3d, Section 4.1 states: "If host software writes
8109 * an invalid value to the Submission Queue Tail Doorbell or
8110 * Completion Queue Head Doorbell register and an Asynchronous Event
8111 * Request command is outstanding, then an asynchronous event is
8112 * posted to the Admin Completion Queue with a status code of
8113 * Invalid Doorbell Write Value."
8114 *
8115 * Also note that the spec includes the "Invalid Doorbell Register"
8116 * status code, but nowhere does it specify when to use it.
8117 * However, it seems reasonable to use it here in a similar
8118 * fashion.
8119 */
8120 if (n->outstanding_aers) {
8121 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8122 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8123 NVME_LOG_ERROR_INFO);
8124 }
8125
8126 return;
8127 }
8128
8129 cq = n->cq[qid];
8130 if (unlikely(new_head >= cq->size)) {
8131 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
8132 "completion queue doorbell write value"
8133                            " beyond queue size, cqid=%"PRIu32","
8134 " new_head=%"PRIu16", ignoring",
8135 qid, new_head);
8136
8137 if (n->outstanding_aers) {
8138 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8139 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8140 NVME_LOG_ERROR_INFO);
8141 }
8142
8143 return;
8144 }
8145
8146 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
8147
8148         /* schedule deferred cqe posting if the queue was previously full */
8149 if (nvme_cq_full(cq)) {
8150 qemu_bh_schedule(cq->bh);
8151 }
8152
8153 cq->head = new_head;
8154 if (!qid && n->dbbuf_enabled) {
8155 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
8156 }
8157
8158 if (cq->tail == cq->head) {
8159 if (cq->irq_enabled) {
8160 n->cq_pending--;
8161 }
8162
8163 nvme_irq_deassert(n, cq);
8164 }
8165 } else {
8166 /* Submission queue doorbell write */
8167
8168 uint16_t new_tail = val & 0xffff;
8169 NvmeSQueue *sq;
8170
8171 qid = (addr - 0x1000) >> 3;
8172 if (unlikely(nvme_check_sqid(n, qid))) {
8173 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
8174 "submission queue doorbell write"
8175 " for nonexistent queue,"
8176 " sqid=%"PRIu32", ignoring", qid);
8177
8178 if (n->outstanding_aers) {
8179 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8180 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8181 NVME_LOG_ERROR_INFO);
8182 }
8183
8184 return;
8185 }
8186
8187 sq = n->sq[qid];
8188 if (unlikely(new_tail >= sq->size)) {
8189 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
8190 "submission queue doorbell write value"
8191 " beyond queue size, sqid=%"PRIu32","
8192 " new_tail=%"PRIu16", ignoring",
8193 qid, new_tail);
8194
8195 if (n->outstanding_aers) {
8196 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8197 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8198 NVME_LOG_ERROR_INFO);
8199 }
8200
8201 return;
8202 }
8203
8204 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
8205
8206 sq->tail = new_tail;
8207 if (!qid && n->dbbuf_enabled) {
8208 /*
8209 * The spec states "the host shall also update the controller's
8210 * corresponding doorbell property to match the value of that entry
8211 * in the Shadow Doorbell buffer."
8212 *
8213 * Since this context is currently a VM trap, we can safely enforce
8214 * the requirement from the device side in case the host is
8215 * misbehaving.
8216 *
8217          * Note, we shouldn't have to do this, but various drivers,
8218          * including ones that run on Linux, do not update the Admin Queue
8219          * entries, so we can't trust reading them for an appropriate sq tail.
8220 */
8221 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
8222 }
8223
8224 qemu_bh_schedule(sq->bh);
8225 }
8226 }
8227
8228 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
8229 unsigned size)
8230 {
8231 NvmeCtrl *n = (NvmeCtrl *)opaque;
8232
8233 trace_pci_nvme_mmio_write(addr, data, size);
8234
8235 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8236 addr != NVME_REG_CSTS) {
8237 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8238 return;
8239 }
8240
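    /*
     * Offsets below the end of the register block (sizeof(n->bar)) are
     * handled as controller register writes; anything beyond that is
     * treated as a doorbell write.
     */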
8241 if (addr < sizeof(n->bar)) {
8242 nvme_write_bar(n, addr, data, size);
8243 } else {
8244 nvme_process_db(n, addr, data);
8245 }
8246 }
8247
8248 static const MemoryRegionOps nvme_mmio_ops = {
8249 .read = nvme_mmio_read,
8250 .write = nvme_mmio_write,
8251 .endianness = DEVICE_LITTLE_ENDIAN,
8252 .impl = {
8253 .min_access_size = 2,
8254 .max_access_size = 8,
8255 },
8256 };
8257
8258 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
8259 unsigned size)
8260 {
8261 NvmeCtrl *n = (NvmeCtrl *)opaque;
8262 stn_le_p(&n->cmb.buf[addr], size, data);
8263 }
8264
8265 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
8266 {
8267 NvmeCtrl *n = (NvmeCtrl *)opaque;
8268 return ldn_le_p(&n->cmb.buf[addr], size);
8269 }
8270
8271 static const MemoryRegionOps nvme_cmb_ops = {
8272 .read = nvme_cmb_read,
8273 .write = nvme_cmb_write,
8274 .endianness = DEVICE_LITTLE_ENDIAN,
8275 .impl = {
8276 .min_access_size = 1,
8277 .max_access_size = 8,
8278 },
8279 };
8280
8281 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8282 {
8283 NvmeParams *params = &n->params;
8284
8285 if (params->num_queues) {
8286 warn_report("num_queues is deprecated; please use max_ioqpairs "
8287 "instead");
8288
8289 params->max_ioqpairs = params->num_queues - 1;
8290 }
8291
8292 if (n->namespace.blkconf.blk && n->subsys) {
8293 error_setg(errp, "subsystem support is unavailable with legacy "
8294 "namespace ('drive' property)");
8295 return false;
8296 }
8297
8298 if (params->max_ioqpairs < 1 ||
8299 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8300 error_setg(errp, "max_ioqpairs must be between 1 and %d",
8301 NVME_MAX_IOQPAIRS);
8302 return false;
8303 }
8304
8305 if (params->msix_qsize < 1 ||
8306 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8307 error_setg(errp, "msix_qsize must be between 1 and %d",
8308 PCI_MSIX_FLAGS_QSIZE + 1);
8309 return false;
8310 }
8311
8312 if (!params->serial) {
8313 error_setg(errp, "serial property not set");
8314 return false;
8315 }
8316
8317 if (params->mqes < 1) {
8318 error_setg(errp, "mqes property cannot be less than 1");
8319 return false;
8320 }
8321
8322 if (n->pmr.dev) {
8323 if (params->msix_exclusive_bar) {
8324 error_setg(errp, "not enough BARs available to enable PMR");
8325 return false;
8326 }
8327
8328 if (host_memory_backend_is_mapped(n->pmr.dev)) {
8329 error_setg(errp, "can't use already busy memdev: %s",
8330 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8331 return false;
8332 }
8333
8334 if (!is_power_of_2(n->pmr.dev->size)) {
8335             error_setg(errp, "pmr backend size needs to be a power of 2");
8336 return false;
8337 }
8338
8339 host_memory_backend_set_mapped(n->pmr.dev, true);
8340 }
8341
8342 if (!n->params.mdts || ((1 << n->params.mdts) + 1) > IOV_MAX) {
8343 error_setg(errp, "mdts exceeds IOV_MAX");
8344 return false;
8345 }
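    /*
     * Illustrative note, assuming the controller's minimum memory page size
     * of 4 KiB: MDTS is a power-of-two multiplier of that page size, so the
     * default mdts=7 corresponds to a maximum transfer size of 512 KiB.
     */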
8346
8347 if (n->params.zasl > n->params.mdts) {
8348 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8349 "than or equal to mdts (Maximum Data Transfer Size)");
8350 return false;
8351 }
8352
8353 if (!n->params.vsl) {
8354 error_setg(errp, "vsl must be non-zero");
8355 return false;
8356 }
8357
8358 if (params->sriov_max_vfs) {
8359 if (!n->subsys) {
8360 error_setg(errp, "subsystem is required for the use of SR-IOV");
8361 return false;
8362 }
8363
8364 if (params->cmb_size_mb) {
8365 error_setg(errp, "CMB is not supported with SR-IOV");
8366 return false;
8367 }
8368
8369 if (n->pmr.dev) {
8370 error_setg(errp, "PMR is not supported with SR-IOV");
8371 return false;
8372 }
8373
8374 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8375 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8376 " must be set for the use of SR-IOV");
8377 return false;
8378 }
8379
8380 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8381 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8382 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8383 return false;
8384 }
8385
8386 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8387 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8388 " greater than or equal to 2");
8389 return false;
8390 }
8391
8392 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8393 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8394 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8395 return false;
8396 }
8397
8398 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8399 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8400 " greater than or equal to 1");
8401 return false;
8402 }
8403
8404 if (params->sriov_max_vi_per_vf &&
8405 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8406 error_setg(errp, "sriov_max_vi_per_vf must meet:"
8407 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8408 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8409 return false;
8410 }
8411
8412 if (params->sriov_max_vq_per_vf &&
8413 (params->sriov_max_vq_per_vf < 2 ||
8414 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8415 error_setg(errp, "sriov_max_vq_per_vf must meet:"
8416 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8417 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8418 return false;
8419 }
8420 }
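    /*
     * Illustrative example only: a set of SR-IOV parameters satisfying the
     * checks above could be sriov_max_vfs=2, sriov_vq_flexible=4 and
     * sriov_vi_flexible=2, combined with max_ioqpairs >= 6 and
     * msix_qsize >= 3.
     */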
8421
8422 return true;
8423 }
8424
8425 static void nvme_init_state(NvmeCtrl *n)
8426 {
8427 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8428 NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8429 NvmeSecCtrlEntry *sctrl;
8430 PCIDevice *pci = PCI_DEVICE(n);
8431 NvmeAtomic *atomic = &n->atomic;
8432 NvmeIdCtrl *id = &n->id_ctrl;
8433 uint8_t max_vfs;
8434 int i;
8435
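    /*
     * For a VF, the usable I/O queue pairs and MSI-X vectors come from the
     * resources the PF assigned to the corresponding secondary controller
     * entry (nvq/nvi); for the PF they come directly from the device
     * parameters.
     */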
8436 if (pci_is_vf(pci)) {
8437 sctrl = nvme_sctrl(n);
8438 max_vfs = 0;
8439 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8440 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8441 } else {
8442 max_vfs = n->params.sriov_max_vfs;
8443 n->conf_ioqpairs = n->params.max_ioqpairs;
8444 n->conf_msix_qsize = n->params.msix_qsize;
8445 }
8446
8447 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8448 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8449 n->temperature = NVME_TEMPERATURE;
8450 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8451 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8452 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8453 QTAILQ_INIT(&n->aer_queue);
8454
8455 n->nr_sec_ctrls = max_vfs;
8456 for (i = 0; i < max_vfs; i++) {
8457 sctrl = &list[i];
8458 sctrl->pcid = cpu_to_le16(n->cntlid);
8459 sctrl->vfn = cpu_to_le16(i + 1);
8460 }
8461
8462 cap->cntlid = cpu_to_le16(n->cntlid);
8463 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8464
8465 if (pci_is_vf(pci)) {
8466 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8467 } else {
8468 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8469 n->params.sriov_vq_flexible);
8470 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8471 cap->vqrfap = cap->vqfrt;
8472 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8473 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8474 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8475 cap->vqfrt / MAX(max_vfs, 1);
8476 }
8477
8478 if (pci_is_vf(pci)) {
8479 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8480 } else {
8481 cap->viprt = cpu_to_le16(n->params.msix_qsize -
8482 n->params.sriov_vi_flexible);
8483 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8484 cap->virfap = cap->vifrt;
8485 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8486 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8487 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8488 cap->vifrt / MAX(max_vfs, 1);
8489 }
8490
8491 /* Atomic Write */
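    /*
     * AWUN and AWUPF are 0's based values (a value of N covers N + 1
     * logical blocks), hence the "+ 1" when deriving the maximum atomic
     * write size below.
     */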
8492 id->awun = cpu_to_le16(n->params.atomic_awun);
8493 id->awupf = cpu_to_le16(n->params.atomic_awupf);
8494 n->dn = n->params.atomic_dn;
8495
8496 if (id->awun || id->awupf) {
8497 if (id->awupf > id->awun) {
8498 id->awupf = 0;
8499 }
8500
8501 if (n->dn) {
8502 atomic->atomic_max_write_size = id->awupf + 1;
8503 } else {
8504 atomic->atomic_max_write_size = id->awun + 1;
8505 }
8506
8507 if (atomic->atomic_max_write_size == 1) {
8508 atomic->atomic_writes = 0;
8509 } else {
8510 atomic->atomic_writes = 1;
8511 }
8512 }
8513 }
8514
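/*
 * Illustrative note: the CMB is sized directly from the cmb_size_mb
 * parameter, so e.g. cmb_size_mb=64 backs a 64 MiB prefetchable region in
 * BAR 2 (NVME_CMB_BIR); nvme_cmb_enable_regs() advertises the same size via
 * CMBSZ using the 1 MiB size unit (SZU = 2).
 */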
8515 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8516 {
8517 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8518 uint64_t cap = ldq_le_p(&n->bar.cap);
8519
8520 n->cmb.buf = g_malloc0(cmb_size);
8521 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8522 "nvme-cmb", cmb_size);
8523 pci_register_bar(pci_dev, NVME_CMB_BIR,
8524 PCI_BASE_ADDRESS_SPACE_MEMORY |
8525 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8526 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8527
8528 NVME_CAP_SET_CMBS(cap, 1);
8529 stq_le_p(&n->bar.cap, cap);
8530
8531 if (n->params.legacy_cmb) {
8532 nvme_cmb_enable_regs(n);
8533 n->cmb.cmse = true;
8534 }
8535 }
8536
8537 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8538 {
8539 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8540
8541 NVME_PMRCAP_SET_RDS(pmrcap, 1);
8542 NVME_PMRCAP_SET_WDS(pmrcap, 1);
8543 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8544     /* Turn on PMRWBM bit 1 support */
8545 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8546 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8547 stl_le_p(&n->bar.pmrcap, pmrcap);
8548
8549 pci_register_bar(pci_dev, NVME_PMR_BIR,
8550 PCI_BASE_ADDRESS_SPACE_MEMORY |
8551 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8552 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8553
8554 memory_region_set_enabled(&n->pmr.dev->mr, false);
8555 }
8556
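/*
 * Worked example (illustrative, assuming a 4 KiB register block and 4-byte
 * doorbells): with the default max_ioqpairs=64 and msix_qsize=65, the
 * register block plus doorbells (4096 + 2 * 65 * 4 = 4616 bytes) rounds up
 * to 8 KiB, the MSI-X table (65 * 16 bytes) pushes the size past 12 KiB,
 * the PBA adds another 16 bytes, and pow2ceil() yields a 16 KiB BAR.
 */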
8557 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8558 unsigned *msix_table_offset,
8559 unsigned *msix_pba_offset)
8560 {
8561 uint64_t bar_size, msix_table_size;
8562
8563 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8564
8565 if (total_irqs == 0) {
8566 goto out;
8567 }
8568
8569 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8570
8571 if (msix_table_offset) {
8572 *msix_table_offset = bar_size;
8573 }
8574
8575 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8576 bar_size += msix_table_size;
8577 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8578
8579 if (msix_pba_offset) {
8580 *msix_pba_offset = bar_size;
8581 }
8582
8583 bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8584
8585 out:
8586 return pow2ceil(bar_size);
8587 }
8588
8589 static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
8590 Error **errp)
8591 {
8592 uint16_t vf_dev_id = n->params.use_intel_id ?
8593 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8594 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8595 uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8596 le16_to_cpu(cap->vifrsm),
8597 NULL, NULL);
8598
8599 if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8600 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8601 NVME_VF_OFFSET, NVME_VF_STRIDE, errp)) {
8602 return false;
8603 }
8604
8605 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8606 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8607
8608 return true;
8609 }
8610
8611 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8612 {
8613 Error *err = NULL;
8614 int ret;
8615
8616 ret = pci_pm_init(pci_dev, offset, &err);
8617 if (err) {
8618 error_report_err(err);
8619 return ret;
8620 }
8621
8622 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8623 PCI_PM_CAP_VER_1_2);
8624 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8625 PCI_PM_CTRL_NO_SOFT_RESET);
8626 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8627 PCI_PM_CTRL_STATE_MASK);
8628
8629 return 0;
8630 }
8631
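/*
 * Forward a DOE mailbox object to the external SPDM responder. DOE object
 * lengths are expressed in dwords, hence the "* 4" on the request side and
 * the DIV_ROUND_UP(recvd, 4) when accounting for the response below.
 */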
8632 static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8633 {
8634 void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8635 uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8636 void *rsp = doe_cap->read_mbox;
8637 uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8638
8639 uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8640 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8641 req, req_len, rsp, rsp_len);
8642 doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8643
8644 return recvd != 0;
8645 }
8646
8647 static DOEProtocol doe_spdm_prot[] = {
8648 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8649 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8650 { }
8651 };
8652
8653 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8654 {
8655 ERRP_GUARD();
8656 uint8_t *pci_conf = pci_dev->config;
8657 uint64_t bar_size;
8658 unsigned msix_table_offset = 0, msix_pba_offset = 0;
8659 unsigned nr_vectors;
8660 int ret;
8661
8662 pci_conf[PCI_INTERRUPT_PIN] = pci_is_vf(pci_dev) ? 0 : 1;
8663 pci_config_set_prog_interface(pci_conf, 0x2);
8664
8665 if (n->params.use_intel_id) {
8666 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8667 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8668 } else {
8669 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8670 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8671 }
8672
8673 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8674 nvme_add_pm_capability(pci_dev, 0x60);
8675 pcie_endpoint_cap_init(pci_dev, 0x80);
8676 pcie_cap_flr_init(pci_dev);
8677 if (n->params.sriov_max_vfs) {
8678 pcie_ari_init(pci_dev, 0x100);
8679 }
8680
8681 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8682 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8683 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8684 bar_size);
8685 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8686 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8687 ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8688 } else {
8689 assert(n->params.msix_qsize >= 1);
8690
8691 /* add one to max_ioqpairs to account for the admin queue pair */
8692 if (!pci_is_vf(pci_dev)) {
8693 nr_vectors = n->params.msix_qsize;
8694 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8695 nr_vectors, &msix_table_offset,
8696 &msix_pba_offset);
8697 } else {
8698 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8699 NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8700
8701 nr_vectors = le16_to_cpu(cap->vifrsm);
8702 bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8703 &msix_table_offset, &msix_pba_offset);
8704 }
8705
8706 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8707 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8708 msix_table_offset);
8709 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8710
8711 if (pci_is_vf(pci_dev)) {
8712 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8713 } else {
8714 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8715 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8716 }
8717
8718 ret = msix_init(pci_dev, nr_vectors,
8719 &n->bar0, 0, msix_table_offset,
8720 &n->bar0, 0, msix_pba_offset, 0, errp);
8721 }
8722
8723 if (ret == -ENOTSUP) {
8724 /* report that msix is not supported, but do not error out */
8725 warn_report_err(*errp);
8726 *errp = NULL;
8727 } else if (ret < 0) {
8728 /* propagate error to caller */
8729 return false;
8730 }
8731
8732 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
8733 !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
8734 return false;
8735 }
8736
8737 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8738
8739 pcie_cap_deverr_init(pci_dev);
8740
8741 /* DOE Initialisation */
8742 if (pci_dev->spdm_port) {
8743 uint16_t doe_offset = n->params.sriov_max_vfs ?
8744 PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8745 : PCI_CONFIG_SPACE_SIZE;
8746
8747 pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8748 doe_spdm_prot, true, 0);
8749
8750 pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8751 errp);
8752
8753 if (pci_dev->doe_spdm.spdm_socket < 0) {
8754 return false;
8755 }
8756 }
8757
8758 if (n->params.cmb_size_mb) {
8759 nvme_init_cmb(n, pci_dev);
8760 }
8761
8762 if (n->pmr.dev) {
8763 nvme_init_pmr(n, pci_dev);
8764 }
8765
8766 return true;
8767 }
8768
8769 static void nvme_init_subnqn(NvmeCtrl *n)
8770 {
8771 NvmeSubsystem *subsys = n->subsys;
8772 NvmeIdCtrl *id = &n->id_ctrl;
8773
8774 if (!subsys) {
8775 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8776 "nqn.2019-08.org.qemu:%s", n->params.serial);
8777 } else {
8778 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8779 }
8780 }
8781
8782 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8783 {
8784 NvmeIdCtrl *id = &n->id_ctrl;
8785 uint8_t *pci_conf = pci_dev->config;
8786 uint64_t cap = ldq_le_p(&n->bar.cap);
8787 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8788 uint32_t ctratt = le32_to_cpu(id->ctratt);
8789 uint16_t oacs;
8790
8791 memcpy(n->cse.acs, nvme_cse_acs_default, sizeof(n->cse.acs));
8792 memcpy(n->cse.iocs.nvm, nvme_cse_iocs_nvm_default, sizeof(n->cse.iocs.nvm));
8793 memcpy(n->cse.iocs.zoned, nvme_cse_iocs_zoned_default,
8794 sizeof(n->cse.iocs.zoned));
8795
8796 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8797 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8798 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8799 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8800 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8801
8802 id->cntlid = cpu_to_le16(n->cntlid);
8803
8804 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8805
8806 ctratt |= NVME_CTRATT_ELBAS;
8807 if (n->params.ctratt.mem) {
8808 ctratt |= NVME_CTRATT_MEM;
8809 }
8810 id->ctratt = cpu_to_le32(ctratt);
8811
8812 id->rab = 6;
8813
8814 if (n->params.use_intel_id) {
8815 id->ieee[0] = 0xb3;
8816 id->ieee[1] = 0x02;
8817 id->ieee[2] = 0x00;
8818 } else {
8819 id->ieee[0] = 0x00;
8820 id->ieee[1] = 0x54;
8821 id->ieee[2] = 0x52;
8822 }
8823
8824 id->mdts = n->params.mdts;
8825 id->ver = cpu_to_le32(NVME_SPEC_VER);
8826
8827 oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DIRECTIVES;
8828
8829 if (n->params.dbcs) {
8830 oacs |= NVME_OACS_DBCS;
8831
8832 n->cse.acs[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP;
8833 }
8834
8835 if (n->params.sriov_max_vfs) {
8836 oacs |= NVME_OACS_VMS;
8837
8838 n->cse.acs[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP;
8839 }
8840
8841 id->oacs = cpu_to_le16(oacs);
8842
8843 id->cntrltype = 0x1;
8844
8845 /*
8846 * Because the controller always completes the Abort command immediately,
8847 * there can never be more than one concurrently executing Abort command,
8848 * so this value is never used for anything. Note that there can easily be
8849 * many Abort commands in the queues, but they are not considered
8850 * "executing" until processed by nvme_abort.
8851 *
8852 * The specification recommends a value of 3 for Abort Command Limit (four
8853      * concurrently outstanding Abort commands), so let's use that though it is
8854 * inconsequential.
8855 */
8856 id->acl = 3;
8857 id->aerl = n->params.aerl;
8858 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8859 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8860
8861 /* recommended default value (~70 C) */
8862 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8863 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8864
8865 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8866 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8867 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8868 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8869 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8870 NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8871 NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8872
8873 /*
8874 * NOTE: If this device ever supports a command set that does NOT use 0x0
8875 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8876 * should probably be removed.
8877 *
8878 * See comment in nvme_io_cmd.
8879 */
8880 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8881
8882 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8883 NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8884 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
8885 NVME_CTRL_SGLS_MPTR_SGL);
8886
8887 nvme_init_subnqn(n);
8888
8889 id->psd[0].mp = cpu_to_le16(0x9c4);
8890 id->psd[0].enlat = cpu_to_le32(0x10);
8891 id->psd[0].exlat = cpu_to_le32(0x4);
8892
8893 NVME_CAP_SET_MQES(cap, n->params.mqes);
8894 NVME_CAP_SET_CQR(cap, 1);
8895 NVME_CAP_SET_TO(cap, 0xf);
8896 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NCSS);
8897 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_IOCSS);
8898 NVME_CAP_SET_MPSMAX(cap, 4);
8899 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8900 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8901 stq_le_p(&n->bar.cap, cap);
8902
8903 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8904 n->bar.intmc = n->bar.intms = 0;
8905
8906 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8907 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8908 }
8909 }
8910
8911 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8912 {
8913 int cntlid;
8914
8915 if (!n->subsys) {
8916 DeviceState *dev = qdev_new(TYPE_NVME_SUBSYS);
8917
8918 qdev_prop_set_string(dev, "nqn", n->params.serial);
8919
8920 if (!qdev_realize(dev, NULL, errp)) {
8921 return -1;
8922 }
8923
8924 n->subsys = NVME_SUBSYS(dev);
8925 } else {
8926 NvmeIdCtrl *id = &n->id_ctrl;
8927 uint32_t ctratt = le32_to_cpu(id->ctratt);
8928
8929 id->cmic |= NVME_CMIC_MULTI_CTRL;
8930 ctratt |= NVME_CTRATT_ENDGRPS;
8931
8932 id->endgidmax = cpu_to_le16(0x1);
8933
8934 if (n->subsys->endgrp.fdp.enabled) {
8935 ctratt |= NVME_CTRATT_FDPS;
8936 }
8937
8938 id->ctratt = cpu_to_le32(ctratt);
8939 }
8940
8941 cntlid = nvme_subsys_register_ctrl(n, errp);
8942 if (cntlid < 0) {
8943 return -1;
8944 }
8945
8946 n->cntlid = cntlid;
8947
8948 return 0;
8949 }
8950
8951 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8952 {
8953 uint32_t nsid = ns->params.nsid;
8954 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8955
8956 n->namespaces[nsid] = ns;
8957 ns->attached++;
8958 }
8959
8960 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8961 {
8962 NvmeCtrl *n = NVME(pci_dev);
8963 DeviceState *dev = DEVICE(pci_dev);
8964 NvmeNamespace *ns;
8965 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8966
8967 if (pci_is_vf(pci_dev)) {
8968 /*
8969      * VFs derive settings from the parent. The PF's lifespan exceeds
8970      * that of its VFs.
8971 */
8972 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8973
8974 /*
8975      * Copy the PF's serial into new memory so that releasing a VF's
8976      * 'serial' property does not free the PF's string when the VF is removed.
8977 */
8978 n->params.serial = g_strdup(pn->params.serial);
8979 n->subsys = pn->subsys;
8980
8981 /*
8982 * Assigning this link (strong link) causes an `object_unref` later in
8983 * `object_release_link_property`. Increment the refcount to balance
8984 * this out.
8985 */
8986 object_ref(OBJECT(pn->subsys));
8987 }
8988
8989 if (!nvme_check_params(n, errp)) {
8990 return;
8991 }
8992
8993 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8994
8995 if (nvme_init_subsys(n, errp)) {
8996 return;
8997 }
8998 nvme_init_state(n);
8999 if (!nvme_init_pci(n, pci_dev, errp)) {
9000 return;
9001 }
9002 nvme_init_ctrl(n, pci_dev);
9003
9004 /* setup a namespace if the controller drive property was given */
9005 if (n->namespace.blkconf.blk) {
9006 ns = &n->namespace;
9007 ns->params.nsid = 1;
9008 ns->ctrl = n;
9009
9010 if (nvme_ns_setup(ns, errp)) {
9011 return;
9012 }
9013
9014 n->subsys->namespaces[ns->params.nsid] = ns;
9015 }
9016 }
9017
9018 static void nvme_exit(PCIDevice *pci_dev)
9019 {
9020 NvmeCtrl *n = NVME(pci_dev);
9021 NvmeNamespace *ns;
9022 int i;
9023
9024 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9025
9026 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
9027 ns = nvme_ns(n, i);
9028 if (ns) {
9029 ns->attached--;
9030 }
9031 }
9032
9033 nvme_subsys_unregister_ctrl(n->subsys, n);
9034
9035 g_free(n->cq);
9036 g_free(n->sq);
9037 g_free(n->aer_reqs);
9038
9039 if (n->params.cmb_size_mb) {
9040 g_free(n->cmb.buf);
9041 }
9042
9043 if (pci_dev->doe_spdm.spdm_socket > 0) {
9044 spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
9045 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
9046 }
9047
9048 if (n->pmr.dev) {
9049 host_memory_backend_set_mapped(n->pmr.dev, false);
9050 }
9051
9052 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
9053 pcie_sriov_pf_exit(pci_dev);
9054 }
9055
9056 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
9057 msix_uninit_exclusive_bar(pci_dev);
9058 } else {
9059 msix_uninit(pci_dev, &n->bar0, &n->bar0);
9060 }
9061
9062 memory_region_del_subregion(&n->bar0, &n->iomem);
9063 }
9064
9065 static const Property nvme_props[] = {
9066 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
9067 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
9068 HostMemoryBackend *),
9069 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
9070 NvmeSubsystem *),
9071 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
9072 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
9073 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
9074 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
9075 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
9076 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
9077 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
9078 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
9079 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
9080 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
9081 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
9082 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
9083 DEFINE_PROP_BOOL("dbcs", NvmeCtrl, params.dbcs, true),
9084 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
9085 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
9086 params.auto_transition_zones, true),
9087 DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
9088 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
9089 params.sriov_vq_flexible, 0),
9090 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
9091 params.sriov_vi_flexible, 0),
9092 DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
9093 params.sriov_max_vi_per_vf, 0),
9094 DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
9095 params.sriov_max_vq_per_vf, 0),
9096 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
9097 false),
9098 DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
9099 DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
9100 DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
9101 DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
9102 DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
9103 DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
9104 DEFINE_PROP_BOOL("ocp", NvmeCtrl, params.ocp, false),
9105 };
9106
9107 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
9108 void *opaque, Error **errp)
9109 {
9110 NvmeCtrl *n = NVME(obj);
9111 uint8_t value = n->smart_critical_warning;
9112
9113 visit_type_uint8(v, name, &value, errp);
9114 }
9115
9116 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
9117 void *opaque, Error **errp)
9118 {
9119 NvmeCtrl *n = NVME(obj);
9120 uint8_t value, old_value, cap = 0, index, event;
9121
9122 if (!visit_type_uint8(v, name, &value, errp)) {
9123 return;
9124 }
9125
9126 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
9127 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
9128 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
9129 cap |= NVME_SMART_PMR_UNRELIABLE;
9130 }
9131
9132 if ((value & cap) != value) {
9133 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
9134 value & ~cap);
9135 return;
9136 }
9137
9138 old_value = n->smart_critical_warning;
9139 n->smart_critical_warning = value;
9140
9141 /* only inject new bits of smart critical warning */
9142 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
9143 event = 1 << index;
9144 if (value & ~old_value & event)
9145 nvme_smart_event(n, event);
9146 }
9147 }
9148
9149 static void nvme_pci_reset(DeviceState *qdev)
9150 {
9151 PCIDevice *pci_dev = PCI_DEVICE(qdev);
9152 NvmeCtrl *n = NVME(pci_dev);
9153
9154 trace_pci_nvme_pci_reset();
9155 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9156 }
9157
9158 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
9159 {
9160 NvmeCtrl *n = NVME(dev);
9161 NvmeSecCtrlEntry *sctrl;
9162 int i;
9163
9164 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
9165 sctrl = &n->sec_ctrl_list[i];
9166 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
9167 }
9168 }
9169
9170 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
9171 uint32_t val, int len)
9172 {
9173 uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
9174
9175 if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9176 pcie_doe_write_config(&dev->doe_spdm, address, val, len);
9177 }
9178 pci_default_write_config(dev, address, val, len);
9179 pcie_cap_flr_write_config(dev, address, val, len);
9180 nvme_sriov_post_write_config(dev, old_num_vfs);
9181 }
9182
9183 static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
9184 {
9185 uint32_t val;
9186 if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9187 if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
9188 return val;
9189 }
9190 }
9191 return pci_default_read_config(dev, address, len);
9192 }
9193
9194 static const VMStateDescription nvme_vmstate = {
9195 .name = "nvme",
9196 .unmigratable = 1,
9197 };
9198
9199 static void nvme_class_init(ObjectClass *oc, const void *data)
9200 {
9201 DeviceClass *dc = DEVICE_CLASS(oc);
9202 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
9203
9204 pc->realize = nvme_realize;
9205 pc->config_write = nvme_pci_write_config;
9206 pc->config_read = nvme_pci_read_config;
9207 pc->exit = nvme_exit;
9208 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
9209 pc->revision = 2;
9210
9211 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
9212 dc->desc = "Non-Volatile Memory Express";
9213 device_class_set_props(dc, nvme_props);
9214 dc->vmsd = &nvme_vmstate;
9215 device_class_set_legacy_reset(dc, nvme_pci_reset);
9216 }
9217
9218 static void nvme_instance_init(Object *obj)
9219 {
9220 NvmeCtrl *n = NVME(obj);
9221
9222 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
9223 "bootindex", "/namespace@1,0",
9224 DEVICE(obj));
9225
9226 object_property_add(obj, "smart_critical_warning", "uint8",
9227 nvme_get_smart_warning,
9228 nvme_set_smart_warning, NULL, NULL);
9229 }
9230
9231 static const TypeInfo nvme_info = {
9232 .name = TYPE_NVME,
9233 .parent = TYPE_PCI_DEVICE,
9234 .instance_size = sizeof(NvmeCtrl),
9235 .instance_init = nvme_instance_init,
9236 .class_init = nvme_class_init,
9237 .interfaces = (const InterfaceInfo[]) {
9238 { INTERFACE_PCIE_DEVICE },
9239 { }
9240 },
9241 };
9242
9243 static const TypeInfo nvme_bus_info = {
9244 .name = TYPE_NVME_BUS,
9245 .parent = TYPE_BUS,
9246 .instance_size = sizeof(NvmeBus),
9247 };
9248
9249 static void nvme_register_types(void)
9250 {
9251 type_register_static(&nvme_info);
9252 type_register_static(&nvme_bus_info);
9253 }
9254
9255 type_init(nvme_register_types)
9256