1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 *
16 *
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
20 * NVMe subsystem uses the format from the NVMe specifications in its comments
21 * (i.e. 'h' suffix instead of '0x' prefix).
22 *
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
26 *
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]>, \
39 * sriov_vq_flexible=<N[optional]>, \
40 * sriov_vi_flexible=<N[optional]>, \
41 * sriov_max_vi_per_vf=<N[optional]>, \
42 * sriov_max_vq_per_vf=<N[optional]>, \
43 * subsys=<subsys_id>
44 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
45 * zoned=<true|false[optional]>, \
46 * subsys=<subsys_id>,shared=<true|false[optional]>, \
47 * detached=<true|false[optional]>, \
48 * zoned.zone_size=<N[optional]>, \
49 * zoned.zone_capacity=<N[optional]>, \
50 * zoned.descr_ext_size=<N[optional]>, \
51 * zoned.max_active=<N[optional]>, \
52 * zoned.max_open=<N[optional]>, \
53 * zoned.cross_read=<true|false[optional]>
54 *
55 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
56 * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
57 * default, the device uses the "v1.4 CMB scheme"; use the `legacy-cmb`
58 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
59 *
60 * PMR emulation can be enabled by pointing `pmrdev` to a memory-backend-file.
61 * For example:
62 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
63 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
64 *
65 * The PMR will use BAR 4/5 exclusively.
66 *
67 * To place controller(s) and namespace(s) in a subsystem, provide an
68 * nvme-subsys device as shown above.
69 *
70 * nvme subsystem device parameters
71 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
72 * - `nqn`
73 * This parameter provides the `<nqn_id>` part of the string
74 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
75 * of subsystem controllers. Note that `<nqn_id>` should be unique per
76 * subsystem, but this is not enforced by QEMU. If not specified, it will
77 * default to the value of the `id` parameter (`<subsys_id>`).
78 *
79 * nvme device parameters
80 * ~~~~~~~~~~~~~~~~~~~~~~
81 * - `subsys`
82 * Specifying this parameter attaches the controller to the subsystem and
83 * the SUBNQN field in the controller will report the NQN of the subsystem
84 * device. This also enables multi controller capability represented in
85 * Identify Controller data structure in CMIC (Controller Multi-path I/O and
86 * Namespace Sharing Capabilities).
87 *
88 * - `aerl`
89 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
90 * of concurrently outstanding Asynchronous Event Request commands supported
91 * by the controller. This is a 0's based value.
92 *
93 * - `aer_max_queued`
94 * This is the maximum number of events that the device will enqueue for
95 * completion when there are no outstanding AERs. When the maximum number of
96 * enqueued events is reached, subsequent events will be dropped.
97 *
98 * - `mdts`
99 * Indicates the maximum data transfer size for a command that transfers data
100 * between host-accessible memory and the controller. The value is specified
101 * as a power of two (2^n) and is in units of the minimum memory page size
102 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
103 *
104 * - `vsl`
105 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
106 * this value is specified as a power of two (2^n) and is in units of the
107 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
108 * KiB).
109 *
110 * - `zoned.zasl`
111 * Indicates the maximum data transfer size for the Zone Append command. Like
112 * `mdts`, the value is specified as a power of two (2^n) and is in units of
113 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
114 * defaulting to the value of `mdts`).
115 *
116 * - `zoned.auto_transition`
117 * Indicates whether zones in the Implicitly Opened state may automatically
118 * be transitioned to the Closed state for resource management purposes.
119 * Defaults to 'on'.
120 *
121 * - `sriov_max_vfs`
122 * Indicates the maximum number of PCIe virtual functions supported
123 * by the controller. The default value is 0. Specifying a non-zero value
124 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
125 * Virtual function controllers will not report SR-IOV capability.
126 *
127 * NOTE: Single Root I/O Virtualization support is experimental.
128 * All the related parameters may be subject to change.
129 *
130 * - `sriov_vq_flexible`
131 * Indicates the total number of flexible queue resources assignable to all
132 * the secondary controllers. Implicitly sets the number of primary
133 * controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
134 *
135 * - `sriov_vi_flexible`
136 * Indicates the total number of flexible interrupt resources assignable to
137 * all the secondary controllers. Implicitly sets the number of primary
138 * controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
139 *
140 * - `sriov_max_vi_per_vf`
141 * Indicates the maximum number of virtual interrupt resources assignable
142 * to a secondary controller. The default 0 resolves to
143 * `(sriov_vi_flexible / sriov_max_vfs)`.
144 *
145 * - `sriov_max_vq_per_vf`
146 * Indicates the maximum number of virtual queue resources assignable to
147 * a secondary controller. The default 0 resolves to
148 * `(sriov_vq_flexible / sriov_max_vfs)`.
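 *
 * For illustration (the numbers below are arbitrary example values), a
 * controller exposing two virtual functions that share 8 flexible queue and
 * 8 flexible interrupt resources might be configured as:
 *
 *   -device nvme,serial=<serial>,subsys=<subsys_id>,max_ioqpairs=12, \
 *           sriov_max_vfs=2,sriov_vq_flexible=8,sriov_vi_flexible=8
 *
 * leaving (max_ioqpairs - sriov_vq_flexible) = 4 queue pairs private to the
 * primary controller.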
149 *
150 * nvme namespace device parameters
151 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
152 * - `shared`
153 * When the parent nvme device (as defined explicitly by the 'bus' parameter
154 * or implicitly by the most recently defined NvmeBus) is linked to an
155 * nvme-subsys device, the namespace will be attached to all controllers in
156 * the subsystem. If set to 'off' (the default), the namespace will remain a
157 * private namespace and may only be attached to a single controller at a
158 * time.
159 *
160 * - `detached`
161 * This parameter is only valid together with the `subsys` parameter. If left
162 * at the default value (`false/off`), the namespace will be attached to all
163 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
164 * namespace will be available in the subsystem but not attached to any
165 * controllers.
166 *
167 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
168 * In this case, the following namespace properties are available to configure
169 * zoned operation:
170 * zoned.zone_size=<zone size in bytes, default: 128MiB>
171 * The number may be followed by K, M or G for kilo-, mega- or gigabytes.
172 *
173 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
174 * The value 0 (default) forces zone capacity to be the same as zone
175 * size. The value of this property may not exceed zone size.
176 *
177 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
178 * This value needs to be specified in 64B units. If it is zero,
179 * namespace(s) will not support zone descriptor extensions.
180 *
181 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
182 * The default value means there is no limit to the number of
183 * concurrently active zones.
184 *
185 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
186 * The default value means there is no limit to the number of
187 * concurrently open zones.
188 *
189 * zoned.cross_read=<enable RAZB, default: false>
190 * Setting this property to true enables Read Across Zone Boundaries.
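 *
 * For example (the sizes and limits below are arbitrary illustration values):
 *   -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,zoned=true, \
 *           zoned.zone_size=64M,zoned.zone_capacity=48M, \
 *           zoned.max_open=16,zoned.max_active=32,zoned.cross_read=true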
191 */
192
193 #include "qemu/osdep.h"
194 #include "qemu/cutils.h"
195 #include "qemu/error-report.h"
196 #include "qemu/log.h"
197 #include "qemu/units.h"
198 #include "qemu/range.h"
199 #include "qapi/error.h"
200 #include "qapi/visitor.h"
201 #include "sysemu/sysemu.h"
202 #include "sysemu/block-backend.h"
203 #include "sysemu/hostmem.h"
204 #include "hw/pci/msix.h"
205 #include "hw/pci/pcie_sriov.h"
206 #include "sysemu/spdm-socket.h"
207 #include "migration/vmstate.h"
208
209 #include "nvme.h"
210 #include "dif.h"
211 #include "trace.h"
212
213 #define NVME_MAX_IOQPAIRS 0xffff
214 #define NVME_DB_SIZE 4
215 #define NVME_SPEC_VER 0x00010400
216 #define NVME_CMB_BIR 2
217 #define NVME_PMR_BIR 4
218 #define NVME_TEMPERATURE 0x143
219 #define NVME_TEMPERATURE_WARNING 0x157
220 #define NVME_TEMPERATURE_CRITICAL 0x175
221 #define NVME_NUM_FW_SLOTS 1
222 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
223 #define NVME_VF_RES_GRANULARITY 1
224 #define NVME_VF_OFFSET 0x1
225 #define NVME_VF_STRIDE 1
226
227 #define NVME_GUEST_ERR(trace, fmt, ...) \
228 do { \
229 (trace_##trace)(__VA_ARGS__); \
230 qemu_log_mask(LOG_GUEST_ERROR, #trace \
231 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
232 } while (0)
233
234 static const bool nvme_feature_support[NVME_FID_MAX] = {
235 [NVME_ARBITRATION] = true,
236 [NVME_POWER_MANAGEMENT] = true,
237 [NVME_TEMPERATURE_THRESHOLD] = true,
238 [NVME_ERROR_RECOVERY] = true,
239 [NVME_VOLATILE_WRITE_CACHE] = true,
240 [NVME_NUMBER_OF_QUEUES] = true,
241 [NVME_INTERRUPT_COALESCING] = true,
242 [NVME_INTERRUPT_VECTOR_CONF] = true,
243 [NVME_WRITE_ATOMICITY] = true,
244 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
245 [NVME_TIMESTAMP] = true,
246 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
247 [NVME_COMMAND_SET_PROFILE] = true,
248 [NVME_FDP_MODE] = true,
249 [NVME_FDP_EVENTS] = true,
250 };
251
252 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
253 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
254 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
255 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
256 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
258 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
259 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
260 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
261 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
262 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
263 };
264
265 static const uint32_t nvme_cse_acs[256] = {
266 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
267 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
268 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
269 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
270 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
277 [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
280 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
281 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
282 };
283
284 static const uint32_t nvme_cse_iocs_none[256];
285
286 static const uint32_t nvme_cse_iocs_nvm[256] = {
287 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
288 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
289 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
290 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
291 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
292 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
293 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
294 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
295 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
296 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
297 };
298
299 static const uint32_t nvme_cse_iocs_zoned[256] = {
300 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
301 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
302 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
303 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
304 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
305 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
306 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
307 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
308 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
309 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
310 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
311 };
312
313 static void nvme_process_sq(void *opaque);
314 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
315 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
316
317 static uint16_t nvme_sqid(NvmeRequest *req)
318 {
319 return le16_to_cpu(req->sq->sqid);
320 }
321
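/*
 * Helpers for packing and unpacking FDP placement identifiers. When the
 * endurance group reports a nonzero RGIF, the reclaim group id is carried in
 * the upper RGIF bits of the PID and the placement handle in the low bits;
 * e.g. with rgif = 4 (example value), nvme_make_pid(ns, 2, 5) yields
 * (2 << 12) | 5 = 0x2005, from which nvme_pid2rg() and nvme_pid2ph() recover
 * 2 and 5. With rgif == 0, the PID is just the placement handle.
 */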
322 static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
323 uint16_t ph)
324 {
325 uint16_t rgif = ns->endgrp->fdp.rgif;
326
327 if (!rgif) {
328 return ph;
329 }
330
331 return (rg << (16 - rgif)) | ph;
332 }
333
334 static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
335 {
336 return ph < ns->fdp.nphs;
337 }
338
339 static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
340 {
341 return rg < endgrp->fdp.nrg;
342 }
343
344 static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
345 {
346 uint16_t rgif = ns->endgrp->fdp.rgif;
347
348 if (!rgif) {
349 return pid;
350 }
351
352 return pid & ((1 << (15 - rgif)) - 1);
353 }
354
355 static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
356 {
357 uint16_t rgif = ns->endgrp->fdp.rgif;
358
359 if (!rgif) {
360 return 0;
361 }
362
363 return pid >> (16 - rgif);
364 }
365
366 static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
367 uint16_t *ph, uint16_t *rg)
368 {
369 *rg = nvme_pid2rg(ns, pid);
370 *ph = nvme_pid2ph(ns, pid);
371
372 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
373 }
374
375 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
376 NvmeZoneState state)
377 {
378 if (QTAILQ_IN_USE(zone, entry)) {
379 switch (nvme_get_zone_state(zone)) {
380 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
381 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
382 break;
383 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
384 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
385 break;
386 case NVME_ZONE_STATE_CLOSED:
387 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
388 break;
389 case NVME_ZONE_STATE_FULL:
390 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
391 default:
392 ;
393 }
394 }
395
396 nvme_set_zone_state(zone, state);
397
398 switch (state) {
399 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
400 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
401 break;
402 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
403 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
404 break;
405 case NVME_ZONE_STATE_CLOSED:
406 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
407 break;
408 case NVME_ZONE_STATE_FULL:
409 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
410 case NVME_ZONE_STATE_READ_ONLY:
411 break;
412 default:
413 zone->d.za = 0;
414 }
415 }
416
417 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
418 uint32_t opn, uint32_t zrwa)
419 {
420 if (ns->params.max_active_zones != 0 &&
421 ns->nr_active_zones + act > ns->params.max_active_zones) {
422 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
423 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
424 }
425
426 if (ns->params.max_open_zones != 0 &&
427 ns->nr_open_zones + opn > ns->params.max_open_zones) {
428 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
429 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
430 }
431
432 if (zrwa > ns->zns.numzrwa) {
433 return NVME_NOZRWA | NVME_DNR;
434 }
435
436 return NVME_SUCCESS;
437 }
438
439 /*
440 * Check if we can open a zone without exceeding open/active limits.
441 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
442 */
443 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
444 {
445 return nvme_zns_check_resources(ns, act, opn, 0);
446 }
447
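/*
 * Allocate a slot in the circular FDP event buffer. When the buffer is full,
 * the oldest event is overwritten (`start` is advanced together with `next`).
 */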
448 static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
449 {
450 NvmeFdpEvent *ret = NULL;
451 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
452
453 ret = &ebuf->events[ebuf->next++];
454 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
455 ebuf->next = 0;
456 }
457 if (is_full) {
458 ebuf->start = ebuf->next;
459 } else {
460 ebuf->nelems++;
461 }
462
463 memset(ret, 0, sizeof(NvmeFdpEvent));
464 ret->timestamp = nvme_get_timestamp(n);
465
466 return ret;
467 }
468
469 static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
470 {
471 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
472 }
473
474 static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
475 {
476 NvmeEnduranceGroup *endgrp = ns->endgrp;
477 NvmeRuHandle *ruh;
478 NvmeReclaimUnit *ru;
479 NvmeFdpEvent *e = NULL;
480 uint16_t ph, rg, ruhid;
481
482 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
483 return false;
484 }
485
486 ruhid = ns->fdp.phs[ph];
487
488 ruh = &endgrp->fdp.ruhs[ruhid];
489 ru = &ruh->rus[rg];
490
491 if (ru->ruamw) {
492 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
493 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
494 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
495 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
496 e->pid = cpu_to_le16(pid);
497 e->nsid = cpu_to_le32(ns->params.nsid);
498 e->rgid = cpu_to_le16(rg);
499 e->ruhid = cpu_to_le16(ruhid);
500 }
501
502 /* log (eventual) GC overhead of prematurely swapping the RU */
503 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
504 }
505
506 ru->ruamw = ruh->ruamw;
507
508 return true;
509 }
510
511 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
512 {
513 hwaddr hi, lo;
514
515 if (!n->cmb.cmse) {
516 return false;
517 }
518
519 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
520 hi = lo + int128_get64(n->cmb.mem.size);
521
522 return addr >= lo && addr < hi;
523 }
524
525 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
526 {
527 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
528 return &n->cmb.buf[addr - base];
529 }
530
531 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
532 {
533 hwaddr hi;
534
535 if (!n->pmr.cmse) {
536 return false;
537 }
538
539 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
540
541 return addr >= n->pmr.cba && addr < hi;
542 }
543
544 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
545 {
546 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
547 }
548
549 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
550 {
551 hwaddr hi, lo;
552
553 /*
554 * The purpose of this check is to guard against invalid "local" access to
555 * the iomem (i.e. controller registers). Thus, we check against the range
556 * covered by the 'bar0' MemoryRegion since that is currently composed of
557 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
558 * that if the device model is ever changed to allow the CMB to be located
559 * in BAR0 as well, then this must be changed.
560 */
561 lo = n->bar0.addr;
562 hi = lo + int128_get64(n->bar0.size);
563
564 return addr >= lo && addr < hi;
565 }
566
567 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
568 {
569 hwaddr hi = addr + size - 1;
570 if (hi < addr) {
571 return 1;
572 }
573
574 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
575 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
576 return 0;
577 }
578
579 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
580 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
581 return 0;
582 }
583
584 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
585 }
586
587 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
588 {
589 hwaddr hi = addr + size - 1;
590 if (hi < addr) {
591 return 1;
592 }
593
594 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
595 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
596 return 0;
597 }
598
599 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
600 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
601 return 0;
602 }
603
604 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
605 }
606
607 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
608 {
609 return nsid &&
610 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
611 }
612
613 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
614 {
615 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
616 }
617
618 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
619 {
620 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
621 }
622
623 static void nvme_inc_cq_tail(NvmeCQueue *cq)
624 {
625 cq->tail++;
626 if (cq->tail >= cq->size) {
627 cq->tail = 0;
628 cq->phase = !cq->phase;
629 }
630 }
631
632 static void nvme_inc_sq_head(NvmeSQueue *sq)
633 {
634 sq->head = (sq->head + 1) % sq->size;
635 }
636
637 static uint8_t nvme_cq_full(NvmeCQueue *cq)
638 {
639 return (cq->tail + 1) % cq->size == cq->head;
640 }
641
642 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
643 {
644 return sq->head == sq->tail;
645 }
646
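/*
 * Update the pin-based interrupt line from the per-CQ status bits, respecting
 * the INTMS mask. When MSI-X is enabled, interrupts are delivered per vector
 * and the pin-based line is left untouched.
 */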
647 static void nvme_irq_check(NvmeCtrl *n)
648 {
649 PCIDevice *pci = PCI_DEVICE(n);
650 uint32_t intms = ldl_le_p(&n->bar.intms);
651
652 if (msix_enabled(pci)) {
653 return;
654 }
655 if (~intms & n->irq_status) {
656 pci_irq_assert(pci);
657 } else {
658 pci_irq_deassert(pci);
659 }
660 }
661
662 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
663 {
664 PCIDevice *pci = PCI_DEVICE(n);
665
666 if (cq->irq_enabled) {
667 if (msix_enabled(pci)) {
668 trace_pci_nvme_irq_msix(cq->vector);
669 msix_notify(pci, cq->vector);
670 } else {
671 trace_pci_nvme_irq_pin();
672 assert(cq->vector < 32);
673 n->irq_status |= 1 << cq->vector;
674 nvme_irq_check(n);
675 }
676 } else {
677 trace_pci_nvme_irq_masked();
678 }
679 }
680
681 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
682 {
683 if (cq->irq_enabled) {
684 if (msix_enabled(PCI_DEVICE(n))) {
685 return;
686 } else {
687 assert(cq->vector < 32);
688 if (!n->cq_pending) {
689 n->irq_status &= ~(1 << cq->vector);
690 }
691 nvme_irq_check(n);
692 }
693 }
694 }
695
696 static void nvme_req_clear(NvmeRequest *req)
697 {
698 req->ns = NULL;
699 req->opaque = NULL;
700 req->aiocb = NULL;
701 memset(&req->cqe, 0x0, sizeof(req->cqe));
702 req->status = NVME_SUCCESS;
703 }
704
705 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
706 {
707 if (dma) {
708 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
709 sg->flags = NVME_SG_DMA;
710 } else {
711 qemu_iovec_init(&sg->iov, 0);
712 }
713
714 sg->flags |= NVME_SG_ALLOC;
715 }
716
717 static inline void nvme_sg_unmap(NvmeSg *sg)
718 {
719 if (!(sg->flags & NVME_SG_ALLOC)) {
720 return;
721 }
722
723 if (sg->flags & NVME_SG_DMA) {
724 qemu_sglist_destroy(&sg->qsg);
725 } else {
726 qemu_iovec_destroy(&sg->iov);
727 }
728
729 memset(sg, 0x0, sizeof(*sg));
730 }
731
732 /*
733 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
734 * holds both data and metadata. This function splits the data and metadata
735 * into two separate QSG/IOVs.
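 * For example (illustrative format): with a 512-byte LBA size and 8 bytes of
 * metadata per LBA, the mapped buffer alternates 512 data bytes with 8
 * metadata bytes; the data bytes are gathered into `data` and the metadata
 * into `mdata`.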
736 */
737 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
738 NvmeSg *mdata)
739 {
740 NvmeSg *dst = data;
741 uint32_t trans_len, count = ns->lbasz;
742 uint64_t offset = 0;
743 bool dma = sg->flags & NVME_SG_DMA;
744 size_t sge_len;
745 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
746 int sg_idx = 0;
747
748 assert(sg->flags & NVME_SG_ALLOC);
749
750 while (sg_len) {
751 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
752
753 trans_len = MIN(sg_len, count);
754 trans_len = MIN(trans_len, sge_len - offset);
755
756 if (dst) {
757 if (dma) {
758 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
759 trans_len);
760 } else {
761 qemu_iovec_add(&dst->iov,
762 sg->iov.iov[sg_idx].iov_base + offset,
763 trans_len);
764 }
765 }
766
767 sg_len -= trans_len;
768 count -= trans_len;
769 offset += trans_len;
770
771 if (count == 0) {
772 dst = (dst == data) ? mdata : data;
773 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
774 }
775
776 if (sge_len == offset) {
777 offset = 0;
778 sg_idx++;
779 }
780 }
781 }
782
783 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
784 size_t len)
785 {
786 if (!len) {
787 return NVME_SUCCESS;
788 }
789
790 trace_pci_nvme_map_addr_cmb(addr, len);
791
792 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
793 return NVME_DATA_TRAS_ERROR;
794 }
795
796 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
797
798 return NVME_SUCCESS;
799 }
800
801 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
802 size_t len)
803 {
804 if (!len) {
805 return NVME_SUCCESS;
806 }
807
808 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
809 return NVME_DATA_TRAS_ERROR;
810 }
811
812 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
813
814 return NVME_SUCCESS;
815 }
816
817 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
818 {
819 bool cmb = false, pmr = false;
820
821 if (!len) {
822 return NVME_SUCCESS;
823 }
824
825 trace_pci_nvme_map_addr(addr, len);
826
827 if (nvme_addr_is_iomem(n, addr)) {
828 return NVME_DATA_TRAS_ERROR;
829 }
830
831 if (nvme_addr_is_cmb(n, addr)) {
832 cmb = true;
833 } else if (nvme_addr_is_pmr(n, addr)) {
834 pmr = true;
835 }
836
837 if (cmb || pmr) {
838 if (sg->flags & NVME_SG_DMA) {
839 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
840 }
841
842 if (sg->iov.niov + 1 > IOV_MAX) {
843 goto max_mappings_exceeded;
844 }
845
846 if (cmb) {
847 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
848 } else {
849 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
850 }
851 }
852
853 if (!(sg->flags & NVME_SG_DMA)) {
854 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
855 }
856
857 if (sg->qsg.nsg + 1 > IOV_MAX) {
858 goto max_mappings_exceeded;
859 }
860
861 qemu_sglist_add(&sg->qsg, addr, len);
862
863 return NVME_SUCCESS;
864
865 max_mappings_exceeded:
866 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
867 "number of mappings exceeds 1024");
868 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
869 }
870
871 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
872 {
873 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
874 }
875
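/*
 * Map a PRP1/PRP2 pair into `sg`. PRP1 addresses the first (possibly offset)
 * memory page; if more than one additional page is needed, PRP2 points to a
 * PRP list, otherwise it addresses the second page directly.
 */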
876 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
877 uint64_t prp2, uint32_t len)
878 {
879 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
880 trans_len = MIN(len, trans_len);
881 int num_prps = (len >> n->page_bits) + 1;
882 uint16_t status;
883 int ret;
884
885 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
886
887 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
888
889 status = nvme_map_addr(n, sg, prp1, trans_len);
890 if (status) {
891 goto unmap;
892 }
893
894 len -= trans_len;
895 if (len) {
896 if (len > n->page_size) {
897 g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
898 uint32_t nents, prp_trans;
899 int i = 0;
900
901 /*
902 * The first PRP list entry, pointed to by PRP2, may contain an offset.
903 * Hence, we need to calculate the number of entries based on that
904 * offset.
905 */
906 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
907 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
908 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
909 if (ret) {
910 trace_pci_nvme_err_addr_read(prp2);
911 status = NVME_DATA_TRAS_ERROR;
912 goto unmap;
913 }
914 while (len != 0) {
915 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
916
917 if (i == nents - 1 && len > n->page_size) {
918 if (unlikely(prp_ent & (n->page_size - 1))) {
919 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
920 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
921 goto unmap;
922 }
923
924 i = 0;
925 nents = (len + n->page_size - 1) >> n->page_bits;
926 nents = MIN(nents, n->max_prp_ents);
927 prp_trans = nents * sizeof(uint64_t);
928 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
929 prp_trans);
930 if (ret) {
931 trace_pci_nvme_err_addr_read(prp_ent);
932 status = NVME_DATA_TRAS_ERROR;
933 goto unmap;
934 }
935 prp_ent = le64_to_cpu(prp_list[i]);
936 }
937
938 if (unlikely(prp_ent & (n->page_size - 1))) {
939 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
940 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
941 goto unmap;
942 }
943
944 trans_len = MIN(len, n->page_size);
945 status = nvme_map_addr(n, sg, prp_ent, trans_len);
946 if (status) {
947 goto unmap;
948 }
949
950 len -= trans_len;
951 i++;
952 }
953 } else {
954 if (unlikely(prp2 & (n->page_size - 1))) {
955 trace_pci_nvme_err_invalid_prp2_align(prp2);
956 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
957 goto unmap;
958 }
959 status = nvme_map_addr(n, sg, prp2, len);
960 if (status) {
961 goto unmap;
962 }
963 }
964 }
965
966 return NVME_SUCCESS;
967
968 unmap:
969 nvme_sg_unmap(sg);
970 return status;
971 }
972
973 /*
974 * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
975 * number of bytes mapped from *len.
976 */
977 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
978 NvmeSglDescriptor *segment, uint64_t nsgld,
979 size_t *len, NvmeCmd *cmd)
980 {
981 dma_addr_t addr, trans_len;
982 uint32_t dlen;
983 uint16_t status;
984
985 for (int i = 0; i < nsgld; i++) {
986 uint8_t type = NVME_SGL_TYPE(segment[i].type);
987
988 switch (type) {
989 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
990 break;
991 case NVME_SGL_DESCR_TYPE_SEGMENT:
992 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
993 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
994 default:
995 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
996 }
997
998 dlen = le32_to_cpu(segment[i].len);
999
1000 if (!dlen) {
1001 continue;
1002 }
1003
1004 if (*len == 0) {
1005 /*
1006 * All data has been mapped, but the SGL contains additional
1007 * segments and/or descriptors. The controller might accept
1008 * ignoring the rest of the SGL.
1009 */
1010 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1011 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1012 break;
1013 }
1014
1015 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1016 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1017 }
1018
1019 trans_len = MIN(*len, dlen);
1020
1021 addr = le64_to_cpu(segment[i].addr);
1022
1023 if (UINT64_MAX - addr < dlen) {
1024 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1025 }
1026
1027 status = nvme_map_addr(n, sg, addr, trans_len);
1028 if (status) {
1029 return status;
1030 }
1031
1032 *len -= trans_len;
1033 }
1034
1035 return NVME_SUCCESS;
1036 }
1037
1038 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1039 size_t len, NvmeCmd *cmd)
1040 {
1041 /*
1042 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1043 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1044 * to be larger (as in number of bytes required to describe the SGL
1045 * descriptors and segment chain) than the command transfer size, so it is
1046 * not bounded by MDTS.
1047 */
1048 #define SEG_CHUNK_SIZE 256
1049
1050 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1051 uint64_t nsgld;
1052 uint32_t seg_len;
1053 uint16_t status;
1054 hwaddr addr;
1055 int ret;
1056
1057 sgld = &sgl;
1058 addr = le64_to_cpu(sgl.addr);
1059
1060 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1061
1062 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1063
1064 /*
1065 * If the entire transfer can be described with a single data block it can
1066 * be mapped directly.
1067 */
1068 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1069 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1070 if (status) {
1071 goto unmap;
1072 }
1073
1074 goto out;
1075 }
1076
1077 for (;;) {
1078 switch (NVME_SGL_TYPE(sgld->type)) {
1079 case NVME_SGL_DESCR_TYPE_SEGMENT:
1080 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1081 break;
1082 default:
1083 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1084 }
1085
1086 seg_len = le32_to_cpu(sgld->len);
1087
1088 /* check the length of the (Last) Segment descriptor */
1089 if (!seg_len || seg_len & 0xf) {
1090 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1091 }
1092
1093 if (UINT64_MAX - addr < seg_len) {
1094 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1095 }
1096
1097 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1098
1099 while (nsgld > SEG_CHUNK_SIZE) {
1100 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1101 trace_pci_nvme_err_addr_read(addr);
1102 status = NVME_DATA_TRAS_ERROR;
1103 goto unmap;
1104 }
1105
1106 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1107 &len, cmd);
1108 if (status) {
1109 goto unmap;
1110 }
1111
1112 nsgld -= SEG_CHUNK_SIZE;
1113 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1114 }
1115
1116 ret = nvme_addr_read(n, addr, segment, nsgld *
1117 sizeof(NvmeSglDescriptor));
1118 if (ret) {
1119 trace_pci_nvme_err_addr_read(addr);
1120 status = NVME_DATA_TRAS_ERROR;
1121 goto unmap;
1122 }
1123
1124 last_sgld = &segment[nsgld - 1];
1125
1126 /*
1127 * If the segment ends with a Data Block, then we are done.
1128 */
1129 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1130 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1131 if (status) {
1132 goto unmap;
1133 }
1134
1135 goto out;
1136 }
1137
1138 /*
1139 * If the last descriptor was not a Data Block, then the current
1140 * segment must not be a Last Segment.
1141 */
1142 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1143 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1144 goto unmap;
1145 }
1146
1147 sgld = last_sgld;
1148 addr = le64_to_cpu(sgld->addr);
1149
1150 /*
1151 * Do not map the last descriptor; it will be a Segment or Last Segment
1152 * descriptor and is handled by the next iteration.
1153 */
1154 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1155 if (status) {
1156 goto unmap;
1157 }
1158 }
1159
1160 out:
1161 /* if there is any residual left in len, the SGL was too short */
1162 if (len) {
1163 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1164 goto unmap;
1165 }
1166
1167 return NVME_SUCCESS;
1168
1169 unmap:
1170 nvme_sg_unmap(sg);
1171 return status;
1172 }
1173
1174 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1175 NvmeCmd *cmd)
1176 {
1177 uint64_t prp1, prp2;
1178
1179 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1180 case NVME_PSDT_PRP:
1181 prp1 = le64_to_cpu(cmd->dptr.prp1);
1182 prp2 = le64_to_cpu(cmd->dptr.prp2);
1183
1184 return nvme_map_prp(n, sg, prp1, prp2, len);
1185 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1186 case NVME_PSDT_SGL_MPTR_SGL:
1187 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1188 default:
1189 return NVME_INVALID_FIELD;
1190 }
1191 }
1192
1193 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1194 NvmeCmd *cmd)
1195 {
1196 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1197 hwaddr mptr = le64_to_cpu(cmd->mptr);
1198 uint16_t status;
1199
1200 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1201 NvmeSglDescriptor sgl;
1202
1203 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1204 return NVME_DATA_TRAS_ERROR;
1205 }
1206
1207 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1208 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1209 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1210 }
1211
1212 return status;
1213 }
1214
1215 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1216 status = nvme_map_addr(n, sg, mptr, len);
1217 if (status) {
1218 nvme_sg_unmap(sg);
1219 }
1220
1221 return status;
1222 }
1223
1224 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1225 {
1226 NvmeNamespace *ns = req->ns;
1227 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1228 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1229 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1230 size_t len = nvme_l2b(ns, nlb);
1231 uint16_t status;
1232
1233 if (nvme_ns_ext(ns) &&
1234 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1235 NvmeSg sg;
1236
1237 len += nvme_m2b(ns, nlb);
1238
1239 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1240 if (status) {
1241 return status;
1242 }
1243
1244 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1245 nvme_sg_split(&sg, ns, &req->sg, NULL);
1246 nvme_sg_unmap(&sg);
1247
1248 return NVME_SUCCESS;
1249 }
1250
1251 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1252 }
1253
1254 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1255 {
1256 NvmeNamespace *ns = req->ns;
1257 size_t len = nvme_m2b(ns, nlb);
1258 uint16_t status;
1259
1260 if (nvme_ns_ext(ns)) {
1261 NvmeSg sg;
1262
1263 len += nvme_l2b(ns, nlb);
1264
1265 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1266 if (status) {
1267 return status;
1268 }
1269
1270 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1271 nvme_sg_split(&sg, ns, NULL, &req->sg);
1272 nvme_sg_unmap(&sg);
1273
1274 return NVME_SUCCESS;
1275 }
1276
1277 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1278 }
1279
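/*
 * Copy `len` bytes between `ptr` and the mapped scatter/gather list, taking
 * `bytes` bytes at a time from the mapping and then skipping `skip_bytes`,
 * starting `offset` bytes into the mapping. The callers use this to pick out
 * either the data or the metadata portion of an extended-LBA mapping.
 */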
1280 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1281 uint32_t len, uint32_t bytes,
1282 int32_t skip_bytes, int64_t offset,
1283 NvmeTxDirection dir)
1284 {
1285 hwaddr addr;
1286 uint32_t trans_len, count = bytes;
1287 bool dma = sg->flags & NVME_SG_DMA;
1288 int64_t sge_len;
1289 int sg_idx = 0;
1290 int ret;
1291
1292 assert(sg->flags & NVME_SG_ALLOC);
1293
1294 while (len) {
1295 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1296
1297 if (sge_len - offset < 0) {
1298 offset -= sge_len;
1299 sg_idx++;
1300 continue;
1301 }
1302
1303 if (sge_len == offset) {
1304 offset = 0;
1305 sg_idx++;
1306 continue;
1307 }
1308
1309 trans_len = MIN(len, count);
1310 trans_len = MIN(trans_len, sge_len - offset);
1311
1312 if (dma) {
1313 addr = sg->qsg.sg[sg_idx].base + offset;
1314 } else {
1315 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1316 }
1317
1318 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1319 ret = nvme_addr_read(n, addr, ptr, trans_len);
1320 } else {
1321 ret = nvme_addr_write(n, addr, ptr, trans_len);
1322 }
1323
1324 if (ret) {
1325 return NVME_DATA_TRAS_ERROR;
1326 }
1327
1328 ptr += trans_len;
1329 len -= trans_len;
1330 count -= trans_len;
1331 offset += trans_len;
1332
1333 if (count == 0) {
1334 count = bytes;
1335 offset += skip_bytes;
1336 }
1337 }
1338
1339 return NVME_SUCCESS;
1340 }
1341
1342 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1343 NvmeTxDirection dir)
1344 {
1345 assert(sg->flags & NVME_SG_ALLOC);
1346
1347 if (sg->flags & NVME_SG_DMA) {
1348 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1349 dma_addr_t residual;
1350
1351 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1352 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1353 } else {
1354 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1355 }
1356
1357 if (unlikely(residual)) {
1358 trace_pci_nvme_err_invalid_dma();
1359 return NVME_INVALID_FIELD | NVME_DNR;
1360 }
1361 } else {
1362 size_t bytes;
1363
1364 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1365 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1366 } else {
1367 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1368 }
1369
1370 if (unlikely(bytes != len)) {
1371 trace_pci_nvme_err_invalid_dma();
1372 return NVME_INVALID_FIELD | NVME_DNR;
1373 }
1374 }
1375
1376 return NVME_SUCCESS;
1377 }
1378
1379 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1380 NvmeRequest *req)
1381 {
1382 uint16_t status;
1383
1384 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1385 if (status) {
1386 return status;
1387 }
1388
1389 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1390 }
1391
1392 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1393 NvmeRequest *req)
1394 {
1395 uint16_t status;
1396
1397 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1398 if (status) {
1399 return status;
1400 }
1401
1402 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1403 }
1404
1405 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1406 NvmeTxDirection dir, NvmeRequest *req)
1407 {
1408 NvmeNamespace *ns = req->ns;
1409 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1410 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1411 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1412
1413 if (nvme_ns_ext(ns) &&
1414 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1415 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1416 ns->lbaf.ms, 0, dir);
1417 }
1418
1419 return nvme_tx(n, &req->sg, ptr, len, dir);
1420 }
1421
1422 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1423 NvmeTxDirection dir, NvmeRequest *req)
1424 {
1425 NvmeNamespace *ns = req->ns;
1426 uint16_t status;
1427
1428 if (nvme_ns_ext(ns)) {
1429 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1430 ns->lbasz, ns->lbasz, dir);
1431 }
1432
1433 nvme_sg_unmap(&req->sg);
1434
1435 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1436 if (status) {
1437 return status;
1438 }
1439
1440 return nvme_tx(n, &req->sg, ptr, len, dir);
1441 }
1442
1443 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1444 uint32_t align, BlockCompletionFunc *cb,
1445 NvmeRequest *req)
1446 {
1447 assert(req->sg.flags & NVME_SG_ALLOC);
1448
1449 if (req->sg.flags & NVME_SG_DMA) {
1450 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1451 } else {
1452 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1453 }
1454 }
1455
1456 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1457 uint32_t align, BlockCompletionFunc *cb,
1458 NvmeRequest *req)
1459 {
1460 assert(req->sg.flags & NVME_SG_ALLOC);
1461
1462 if (req->sg.flags & NVME_SG_DMA) {
1463 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1464 } else {
1465 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1466 }
1467 }
1468
1469 static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1470 {
1471 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1472
1473 stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1474 MEMTXATTRS_UNSPECIFIED);
1475 }
1476
1477 static void nvme_update_cq_head(NvmeCQueue *cq)
1478 {
1479 ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1480 MEMTXATTRS_UNSPECIFIED);
1481
1482 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1483 }
1484
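/*
 * Post completion queue entries for finished requests: write each CQE to the
 * host completion queue, advance the tail, recycle the request onto its
 * submission queue and raise the completion interrupt. Posting stops early if
 * the completion queue is full.
 */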
1485 static void nvme_post_cqes(void *opaque)
1486 {
1487 NvmeCQueue *cq = opaque;
1488 NvmeCtrl *n = cq->ctrl;
1489 NvmeRequest *req, *next;
1490 bool pending = cq->head != cq->tail;
1491 int ret;
1492
1493 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1494 NvmeSQueue *sq;
1495 hwaddr addr;
1496
1497 if (n->dbbuf_enabled) {
1498 nvme_update_cq_eventidx(cq);
1499 nvme_update_cq_head(cq);
1500 }
1501
1502 if (nvme_cq_full(cq)) {
1503 break;
1504 }
1505
1506 sq = req->sq;
1507 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1508 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1509 req->cqe.sq_head = cpu_to_le16(sq->head);
1510 addr = cq->dma_addr + (cq->tail << NVME_CQES);
1511 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1512 sizeof(req->cqe));
1513 if (ret) {
1514 trace_pci_nvme_err_addr_write(addr);
1515 trace_pci_nvme_err_cfs();
1516 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1517 break;
1518 }
1519
1520 QTAILQ_REMOVE(&cq->req_list, req, entry);
1521
1522 nvme_inc_cq_tail(cq);
1523 nvme_sg_unmap(&req->sg);
1524
1525 if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
1526 qemu_bh_schedule(sq->bh);
1527 }
1528
1529 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1530 }
1531 if (cq->tail != cq->head) {
1532 if (cq->irq_enabled && !pending) {
1533 n->cq_pending++;
1534 }
1535
1536 nvme_irq_assert(n, cq);
1537 }
1538 }
1539
1540 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1541 {
1542 assert(cq->cqid == req->sq->cqid);
1543 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1544 le32_to_cpu(req->cqe.result),
1545 le32_to_cpu(req->cqe.dw1),
1546 req->status);
1547
1548 if (req->status) {
1549 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1550 req->status, req->cmd.opcode);
1551 }
1552
1553 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1554 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1555
1556 qemu_bh_schedule(cq->bh);
1557 }
1558
1559 static void nvme_process_aers(void *opaque)
1560 {
1561 NvmeCtrl *n = opaque;
1562 NvmeAsyncEvent *event, *next;
1563
1564 trace_pci_nvme_process_aers(n->aer_queued);
1565
1566 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1567 NvmeRequest *req;
1568 NvmeAerResult *result;
1569
1570 /* can't post cqe if there is nothing to complete */
1571 if (!n->outstanding_aers) {
1572 trace_pci_nvme_no_outstanding_aers();
1573 break;
1574 }
1575
1576 /* ignore if masked (cqe posted, but event not cleared) */
1577 if (n->aer_mask & (1 << event->result.event_type)) {
1578 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1579 continue;
1580 }
1581
1582 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1583 n->aer_queued--;
1584
1585 n->aer_mask |= 1 << event->result.event_type;
1586 n->outstanding_aers--;
1587
1588 req = n->aer_reqs[n->outstanding_aers];
1589
1590 result = (NvmeAerResult *) &req->cqe.result;
1591 result->event_type = event->result.event_type;
1592 result->event_info = event->result.event_info;
1593 result->log_page = event->result.log_page;
1594 g_free(event);
1595
1596 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1597 result->log_page);
1598
1599 nvme_enqueue_req_completion(&n->admin_cq, req);
1600 }
1601 }
1602
1603 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1604 uint8_t event_info, uint8_t log_page)
1605 {
1606 NvmeAsyncEvent *event;
1607
1608 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1609
1610 if (n->aer_queued == n->params.aer_max_queued) {
1611 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1612 return;
1613 }
1614
1615 event = g_new(NvmeAsyncEvent, 1);
1616 event->result = (NvmeAerResult) {
1617 .event_type = event_type,
1618 .event_info = event_info,
1619 .log_page = log_page,
1620 };
1621
1622 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1623 n->aer_queued++;
1624
1625 nvme_process_aers(n);
1626 }
1627
1628 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1629 {
1630 uint8_t aer_info;
1631
1632 /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1633 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1634 return;
1635 }
1636
1637 switch (event) {
1638 case NVME_SMART_SPARE:
1639 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1640 break;
1641 case NVME_SMART_TEMPERATURE:
1642 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1643 break;
1644 case NVME_SMART_RELIABILITY:
1645 case NVME_SMART_MEDIA_READ_ONLY:
1646 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1647 case NVME_SMART_PMR_UNRELIABLE:
1648 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1649 break;
1650 default:
1651 return;
1652 }
1653
1654 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1655 }
1656
1657 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1658 {
1659 n->aer_mask &= ~(1 << event_type);
1660 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1661 nvme_process_aers(n);
1662 }
1663 }
1664
1665 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1666 {
1667 uint8_t mdts = n->params.mdts;
1668
1669 if (mdts && len > n->page_size << mdts) {
1670 trace_pci_nvme_err_mdts(len);
1671 return NVME_INVALID_FIELD | NVME_DNR;
1672 }
1673
1674 return NVME_SUCCESS;
1675 }
1676
1677 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1678 uint32_t nlb)
1679 {
1680 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1681
1682 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1683 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1684 return NVME_LBA_RANGE | NVME_DNR;
1685 }
1686
1687 return NVME_SUCCESS;
1688 }
1689
1690 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1691 uint32_t nlb, int flags)
1692 {
1693 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1694
1695 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1696 int64_t offset = nvme_l2b(ns, slba);
1697 int ret;
1698
1699 /*
1700 * `pnum` holds the number of bytes after offset that share the same
1701 * allocation status as the byte at offset. If `pnum` is different from
1702 * `bytes`, we should check the allocation status of the next range and
1703 * continue this until all bytes have been checked.
1704 */
1705 do {
1706 bytes -= pnum;
1707
1708 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1709 if (ret < 0) {
1710 return ret;
1711 }
1712
1714 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1715 !!(ret & BDRV_BLOCK_ZERO));
1716
1717 if (!(ret & flags)) {
1718 return 1;
1719 }
1720
1721 offset += pnum;
1722 } while (pnum != bytes);
1723
1724 return 0;
1725 }
1726
1727 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1728 uint32_t nlb)
1729 {
1730 int ret;
1731 Error *err = NULL;
1732
1733 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1734 if (ret) {
1735 if (ret < 0) {
1736 error_setg_errno(&err, -ret, "unable to get block status");
1737 error_report_err(err);
1738
1739 return NVME_INTERNAL_DEV_ERROR;
1740 }
1741
1742 return NVME_DULB;
1743 }
1744
1745 return NVME_SUCCESS;
1746 }
1747
1748 static void nvme_aio_err(NvmeRequest *req, int ret)
1749 {
1750 uint16_t status = NVME_SUCCESS;
1751 Error *local_err = NULL;
1752
1753 switch (req->cmd.opcode) {
1754 case NVME_CMD_READ:
1755 status = NVME_UNRECOVERED_READ;
1756 break;
1757 case NVME_CMD_FLUSH:
1758 case NVME_CMD_WRITE:
1759 case NVME_CMD_WRITE_ZEROES:
1760 case NVME_CMD_ZONE_APPEND:
1761 case NVME_CMD_COPY:
1762 status = NVME_WRITE_FAULT;
1763 break;
1764 default:
1765 status = NVME_INTERNAL_DEV_ERROR;
1766 break;
1767 }
1768
1769 if (ret == -ECANCELED) {
1770 status = NVME_CMD_ABORT_REQ;
1771 }
1772
1773 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1774
1775 error_setg_errno(&local_err, -ret, "aio failed");
1776 error_report_err(local_err);
1777
1778 /*
1779 * Set the command status code to the first encountered error but allow a
1780 * subsequent Internal Device Error to trump it.
1781 */
1782 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1783 return;
1784 }
1785
1786 req->status = status;
1787 }
1788
1789 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1790 {
1791 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1792 slba / ns->zone_size;
1793 }
1794
1795 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1796 {
1797 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1798
1799 if (zone_idx >= ns->num_zones) {
1800 return NULL;
1801 }
1802
1803 return &ns->zone_array[zone_idx];
1804 }
1805
1806 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1807 {
1808 uint64_t zslba = zone->d.zslba;
1809
1810 switch (nvme_get_zone_state(zone)) {
1811 case NVME_ZONE_STATE_EMPTY:
1812 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1813 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1814 case NVME_ZONE_STATE_CLOSED:
1815 return NVME_SUCCESS;
1816 case NVME_ZONE_STATE_FULL:
1817 trace_pci_nvme_err_zone_is_full(zslba);
1818 return NVME_ZONE_FULL;
1819 case NVME_ZONE_STATE_OFFLINE:
1820 trace_pci_nvme_err_zone_is_offline(zslba);
1821 return NVME_ZONE_OFFLINE;
1822 case NVME_ZONE_STATE_READ_ONLY:
1823 trace_pci_nvme_err_zone_is_read_only(zslba);
1824 return NVME_ZONE_READ_ONLY;
1825 default:
1826 assert(false);
1827 }
1828
1829 return NVME_INTERNAL_DEV_ERROR;
1830 }
1831
1832 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1833 uint64_t slba, uint32_t nlb)
1834 {
1835 uint64_t zcap = nvme_zone_wr_boundary(zone);
1836 uint16_t status;
1837
1838 status = nvme_check_zone_state_for_write(zone);
1839 if (status) {
1840 return status;
1841 }
1842
1843 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1844 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1845
1846 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1847 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1848 return NVME_ZONE_INVALID_WRITE;
1849 }
1850 } else {
1851 if (unlikely(slba != zone->w_ptr)) {
1852 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1853 zone->w_ptr);
1854 return NVME_ZONE_INVALID_WRITE;
1855 }
1856 }
1857
1858 if (unlikely((slba + nlb) > zcap)) {
1859 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1860 return NVME_ZONE_BOUNDARY_ERROR;
1861 }
1862
1863 return NVME_SUCCESS;
1864 }
1865
1866 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1867 {
1868 switch (nvme_get_zone_state(zone)) {
1869 case NVME_ZONE_STATE_EMPTY:
1870 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1871 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1872 case NVME_ZONE_STATE_FULL:
1873 case NVME_ZONE_STATE_CLOSED:
1874 case NVME_ZONE_STATE_READ_ONLY:
1875 return NVME_SUCCESS;
1876 case NVME_ZONE_STATE_OFFLINE:
1877 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1878 return NVME_ZONE_OFFLINE;
1879 default:
1880 assert(false);
1881 }
1882
1883 return NVME_INTERNAL_DEV_ERROR;
1884 }
1885
1886 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1887 uint32_t nlb)
1888 {
1889 NvmeZone *zone;
1890 uint64_t bndry, end;
1891 uint16_t status;
1892
1893 zone = nvme_get_zone_by_slba(ns, slba);
1894 assert(zone);
1895
1896 bndry = nvme_zone_rd_boundary(ns, zone);
1897 end = slba + nlb;
1898
1899 status = nvme_check_zone_state_for_read(zone);
1900 if (status) {
1901 ;
1902 } else if (unlikely(end > bndry)) {
1903 if (!ns->params.cross_zone_read) {
1904 status = NVME_ZONE_BOUNDARY_ERROR;
1905 } else {
1906 /*
1907 * Read across zone boundary - check that all subsequent
1908 * zones that are being read have an appropriate state.
1909 */
1910 do {
1911 zone++;
1912 status = nvme_check_zone_state_for_read(zone);
1913 if (status) {
1914 break;
1915 }
1916 } while (end > nvme_zone_rd_boundary(ns, zone));
1917 }
1918 }
1919
1920 return status;
1921 }
1922
1923 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1924 {
1925 switch (nvme_get_zone_state(zone)) {
1926 case NVME_ZONE_STATE_FULL:
1927 return NVME_SUCCESS;
1928
1929 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1930 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1931 nvme_aor_dec_open(ns);
1932 /* fallthrough */
1933 case NVME_ZONE_STATE_CLOSED:
1934 nvme_aor_dec_active(ns);
1935
1936 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1937 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1938 if (ns->params.numzrwa) {
1939 ns->zns.numzrwa++;
1940 }
1941 }
1942
1943 /* fallthrough */
1944 case NVME_ZONE_STATE_EMPTY:
1945 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1946 return NVME_SUCCESS;
1947
1948 default:
1949 return NVME_ZONE_INVAL_TRANSITION;
1950 }
1951 }
1952
1953 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1954 {
1955 switch (nvme_get_zone_state(zone)) {
1956 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1957 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1958 nvme_aor_dec_open(ns);
1959 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1960 /* fall through */
1961 case NVME_ZONE_STATE_CLOSED:
1962 return NVME_SUCCESS;
1963
1964 default:
1965 return NVME_ZONE_INVAL_TRANSITION;
1966 }
1967 }
1968
1969 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1970 {
1971 switch (nvme_get_zone_state(zone)) {
1972 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1973 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1974 nvme_aor_dec_open(ns);
1975 /* fallthrough */
1976 case NVME_ZONE_STATE_CLOSED:
1977 nvme_aor_dec_active(ns);
1978
1979 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1980 if (ns->params.numzrwa) {
1981 ns->zns.numzrwa++;
1982 }
1983 }
1984
1985 /* fallthrough */
1986 case NVME_ZONE_STATE_FULL:
1987 zone->w_ptr = zone->d.zslba;
1988 zone->d.wp = zone->w_ptr;
1989 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1990 /* fallthrough */
1991 case NVME_ZONE_STATE_EMPTY:
1992 return NVME_SUCCESS;
1993
1994 default:
1995 return NVME_ZONE_INVAL_TRANSITION;
1996 }
1997 }
1998
1999 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
2000 {
2001 NvmeZone *zone;
2002
2003 if (ns->params.max_open_zones &&
2004 ns->nr_open_zones == ns->params.max_open_zones) {
2005 zone = QTAILQ_FIRST(&ns->imp_open_zones);
2006 if (zone) {
2007 /*
2008 * Automatically close this implicitly open zone.
2009 */
2010 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
2011 nvme_zrm_close(ns, zone);
2012 }
2013 }
2014 }
2015
2016 enum {
2017 NVME_ZRM_AUTO = 1 << 0,
2018 NVME_ZRM_ZRWA = 1 << 1,
2019 };
2020
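/*
 * Open a zone through the zone resource management (ZRM) state machine.
 * NVME_ZRM_AUTO marks the zone Implicitly Opened (as done on writes);
 * NVME_ZRM_ZRWA additionally allocates a Zone Random Write Area for it.
 */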
2021 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
2022 NvmeZone *zone, int flags)
2023 {
2024 int act = 0;
2025 uint16_t status;
2026
2027 switch (nvme_get_zone_state(zone)) {
2028 case NVME_ZONE_STATE_EMPTY:
2029 act = 1;
2030
2031 /* fallthrough */
2032
2033 case NVME_ZONE_STATE_CLOSED:
2034 if (n->params.auto_transition_zones) {
2035 nvme_zrm_auto_transition_zone(ns);
2036 }
2037 status = nvme_zns_check_resources(ns, act, 1,
2038 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2039 if (status) {
2040 return status;
2041 }
2042
2043 if (act) {
2044 nvme_aor_inc_active(ns);
2045 }
2046
2047 nvme_aor_inc_open(ns);
2048
2049 if (flags & NVME_ZRM_AUTO) {
2050 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2051 return NVME_SUCCESS;
2052 }
2053
2054 /* fallthrough */
2055
2056 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2057 if (flags & NVME_ZRM_AUTO) {
2058 return NVME_SUCCESS;
2059 }
2060
2061 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2062
2063 /* fallthrough */
2064
2065 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2066 if (flags & NVME_ZRM_ZRWA) {
2067 ns->zns.numzrwa--;
2068
2069 zone->d.za |= NVME_ZA_ZRWA_VALID;
2070 }
2071
2072 return NVME_SUCCESS;
2073
2074 default:
2075 return NVME_ZONE_INVAL_TRANSITION;
2076 }
2077 }
2078
2079 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2080 NvmeZone *zone)
2081 {
2082 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2083 }
2084
2085 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2086 uint32_t nlb)
2087 {
2088 zone->d.wp += nlb;
2089
2090 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2091 nvme_zrm_finish(ns, zone);
2092 }
2093 }
2094
2095 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2096 uint32_t nlbc)
2097 {
2098 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2099
2100 nlbc = nzrwafgs * ns->zns.zrwafg;
2101
2102 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2103
2104 zone->w_ptr += nlbc;
2105
2106 nvme_advance_zone_wp(ns, zone, nlbc);
2107 }
2108
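/*
 * Called when a zoned write completes: advance the zone write pointer, or,
 * if a Zone Random Write Area is active, implicitly flush the part of the
 * ZRWA that the written range extends beyond.
 */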
2109 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2110 {
2111 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2112 NvmeZone *zone;
2113 uint64_t slba;
2114 uint32_t nlb;
2115
2116 slba = le64_to_cpu(rw->slba);
2117 nlb = le16_to_cpu(rw->nlb) + 1;
2118 zone = nvme_get_zone_by_slba(ns, slba);
2119 assert(zone);
2120
2121 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2122 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2123 uint64_t elba = slba + nlb - 1;
2124
2125 if (elba > ezrwa) {
2126 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2127 }
2128
2129 return;
2130 }
2131
2132 nvme_advance_zone_wp(ns, zone, nlb);
2133 }
2134
2135 static inline bool nvme_is_write(NvmeRequest *req)
2136 {
2137 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2138
2139 return rw->opcode == NVME_CMD_WRITE ||
2140 rw->opcode == NVME_CMD_ZONE_APPEND ||
2141 rw->opcode == NVME_CMD_WRITE_ZEROES;
2142 }
2143
2144 static void nvme_misc_cb(void *opaque, int ret)
2145 {
2146 NvmeRequest *req = opaque;
2147
2148 trace_pci_nvme_misc_cb(nvme_cid(req));
2149
2150 if (ret) {
2151 nvme_aio_err(req, ret);
2152 }
2153
2154 nvme_enqueue_req_completion(nvme_cq(req), req);
2155 }
2156
2157 void nvme_rw_complete_cb(void *opaque, int ret)
2158 {
2159 NvmeRequest *req = opaque;
2160 NvmeNamespace *ns = req->ns;
2161 BlockBackend *blk = ns->blkconf.blk;
2162 BlockAcctCookie *acct = &req->acct;
2163 BlockAcctStats *stats = blk_get_stats(blk);
2164
2165 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2166
2167 if (ret) {
2168 block_acct_failed(stats, acct);
2169 nvme_aio_err(req, ret);
2170 } else {
2171 block_acct_done(stats, acct);
2172 }
2173
2174 if (ns->params.zoned && nvme_is_write(req)) {
2175 nvme_finalize_zoned_write(ns, req);
2176 }
2177
2178 nvme_enqueue_req_completion(nvme_cq(req), req);
2179 }
2180
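/*
 * Completion callback for the data portion of a read/write. If the
 * namespace has metadata, issue the metadata transfer (or zero it for
 * Write Zeroes) before completing the request.
 */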
2181 static void nvme_rw_cb(void *opaque, int ret)
2182 {
2183 NvmeRequest *req = opaque;
2184 NvmeNamespace *ns = req->ns;
2185
2186 BlockBackend *blk = ns->blkconf.blk;
2187
2188 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2189
2190 if (ret) {
2191 goto out;
2192 }
2193
2194 if (ns->lbaf.ms) {
2195 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2196 uint64_t slba = le64_to_cpu(rw->slba);
2197 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2198 uint64_t offset = nvme_moff(ns, slba);
2199
2200 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2201 size_t mlen = nvme_m2b(ns, nlb);
2202
2203 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2204 BDRV_REQ_MAY_UNMAP,
2205 nvme_rw_complete_cb, req);
2206 return;
2207 }
2208
2209 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2210 uint16_t status;
2211
2212 nvme_sg_unmap(&req->sg);
2213 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2214 if (status) {
2215 ret = -EFAULT;
2216 goto out;
2217 }
2218
2219 if (req->cmd.opcode == NVME_CMD_READ) {
2220 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2221 }
2222
2223 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2224 }
2225 }
2226
2227 out:
2228 nvme_rw_complete_cb(req, ret);
2229 }
2230
2231 static void nvme_verify_cb(void *opaque, int ret)
2232 {
2233 NvmeBounceContext *ctx = opaque;
2234 NvmeRequest *req = ctx->req;
2235 NvmeNamespace *ns = req->ns;
2236 BlockBackend *blk = ns->blkconf.blk;
2237 BlockAcctCookie *acct = &req->acct;
2238 BlockAcctStats *stats = blk_get_stats(blk);
2239 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2240 uint64_t slba = le64_to_cpu(rw->slba);
2241 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2242 uint16_t apptag = le16_to_cpu(rw->apptag);
2243 uint16_t appmask = le16_to_cpu(rw->appmask);
2244 uint64_t reftag = le32_to_cpu(rw->reftag);
2245 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2246 uint16_t status;
2247
2248 reftag |= cdw3 << 32;
2249
2250 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2251
2252 if (ret) {
2253 block_acct_failed(stats, acct);
2254 nvme_aio_err(req, ret);
2255 goto out;
2256 }
2257
2258 block_acct_done(stats, acct);
2259
2260 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2261 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2262 ctx->mdata.iov.size, slba);
2263 if (status) {
2264 req->status = status;
2265 goto out;
2266 }
2267
2268 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2269 ctx->mdata.bounce, ctx->mdata.iov.size,
2270 prinfo, slba, apptag, appmask, &reftag);
2271 }
2272
2273 out:
2274 qemu_iovec_destroy(&ctx->data.iov);
2275 g_free(ctx->data.bounce);
2276
2277 qemu_iovec_destroy(&ctx->mdata.iov);
2278 g_free(ctx->mdata.bounce);
2279
2280 g_free(ctx);
2281
2282 nvme_enqueue_req_completion(nvme_cq(req), req);
2283 }
2284
2286 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2287 {
2288 NvmeBounceContext *ctx = opaque;
2289 NvmeRequest *req = ctx->req;
2290 NvmeNamespace *ns = req->ns;
2291 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2292 uint64_t slba = le64_to_cpu(rw->slba);
2293 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2294 size_t mlen = nvme_m2b(ns, nlb);
2295 uint64_t offset = nvme_moff(ns, slba);
2296 BlockBackend *blk = ns->blkconf.blk;
2297
2298 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2299
2300 if (ret) {
2301 goto out;
2302 }
2303
2304 ctx->mdata.bounce = g_malloc(mlen);
2305
2306 qemu_iovec_reset(&ctx->mdata.iov);
2307 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2308
2309 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2310 nvme_verify_cb, ctx);
2311 return;
2312
2313 out:
2314 nvme_verify_cb(ctx, ret);
2315 }
2316
2317 struct nvme_compare_ctx {
2318 struct {
2319 QEMUIOVector iov;
2320 uint8_t *bounce;
2321 } data;
2322
2323 struct {
2324 QEMUIOVector iov;
2325 uint8_t *bounce;
2326 } mdata;
2327 };
2328
2329 static void nvme_compare_mdata_cb(void *opaque, int ret)
2330 {
2331 NvmeRequest *req = opaque;
2332 NvmeNamespace *ns = req->ns;
2333 NvmeCtrl *n = nvme_ctrl(req);
2334 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2335 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2336 uint16_t apptag = le16_to_cpu(rw->apptag);
2337 uint16_t appmask = le16_to_cpu(rw->appmask);
2338 uint64_t reftag = le32_to_cpu(rw->reftag);
2339 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2340 struct nvme_compare_ctx *ctx = req->opaque;
2341 g_autofree uint8_t *buf = NULL;
2342 BlockBackend *blk = ns->blkconf.blk;
2343 BlockAcctCookie *acct = &req->acct;
2344 BlockAcctStats *stats = blk_get_stats(blk);
2345 uint16_t status = NVME_SUCCESS;
2346
2347 reftag |= cdw3 << 32;
2348
2349 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2350
2351 if (ret) {
2352 block_acct_failed(stats, acct);
2353 nvme_aio_err(req, ret);
2354 goto out;
2355 }
2356
2357 buf = g_malloc(ctx->mdata.iov.size);
2358
2359 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2360 NVME_TX_DIRECTION_TO_DEVICE, req);
2361 if (status) {
2362 req->status = status;
2363 goto out;
2364 }
2365
2366 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2367 uint64_t slba = le64_to_cpu(rw->slba);
2368 uint8_t *bufp;
2369 uint8_t *mbufp = ctx->mdata.bounce;
2370 uint8_t *end = mbufp + ctx->mdata.iov.size;
2371 int16_t pil = 0;
2372
2373 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2374 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2375 slba, apptag, appmask, &reftag);
2376 if (status) {
2377 req->status = status;
2378 goto out;
2379 }
2380
2381 /*
2382 * When formatted with protection information, do not compare the DIF
2383 * tuple.
2384 */
2385 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2386 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2387 }
2388
2389 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2390 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2391 req->status = NVME_CMP_FAILURE | NVME_DNR;
2392 goto out;
2393 }
2394 }
2395
2396 goto out;
2397 }
2398
2399 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2400 req->status = NVME_CMP_FAILURE | NVME_DNR;
2401 goto out;
2402 }
2403
2404 block_acct_done(stats, acct);
2405
2406 out:
2407 qemu_iovec_destroy(&ctx->data.iov);
2408 g_free(ctx->data.bounce);
2409
2410 qemu_iovec_destroy(&ctx->mdata.iov);
2411 g_free(ctx->mdata.bounce);
2412
2413 g_free(ctx);
2414
2415 nvme_enqueue_req_completion(nvme_cq(req), req);
2416 }
2417
2418 static void nvme_compare_data_cb(void *opaque, int ret)
2419 {
2420 NvmeRequest *req = opaque;
2421 NvmeCtrl *n = nvme_ctrl(req);
2422 NvmeNamespace *ns = req->ns;
2423 BlockBackend *blk = ns->blkconf.blk;
2424 BlockAcctCookie *acct = &req->acct;
2425 BlockAcctStats *stats = blk_get_stats(blk);
2426
2427 struct nvme_compare_ctx *ctx = req->opaque;
2428 g_autofree uint8_t *buf = NULL;
2429 uint16_t status;
2430
2431 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2432
2433 if (ret) {
2434 block_acct_failed(stats, acct);
2435 nvme_aio_err(req, ret);
2436 goto out;
2437 }
2438
2439 buf = g_malloc(ctx->data.iov.size);
2440
2441 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2442 NVME_TX_DIRECTION_TO_DEVICE, req);
2443 if (status) {
2444 req->status = status;
2445 goto out;
2446 }
2447
2448 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2449 req->status = NVME_CMP_FAILURE | NVME_DNR;
2450 goto out;
2451 }
2452
2453 if (ns->lbaf.ms) {
2454 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2455 uint64_t slba = le64_to_cpu(rw->slba);
2456 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2457 size_t mlen = nvme_m2b(ns, nlb);
2458 uint64_t offset = nvme_moff(ns, slba);
2459
2460 ctx->mdata.bounce = g_malloc(mlen);
2461
2462 qemu_iovec_init(&ctx->mdata.iov, 1);
2463 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2464
2465 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2466 nvme_compare_mdata_cb, req);
2467 return;
2468 }
2469
2470 block_acct_done(stats, acct);
2471
2472 out:
2473 qemu_iovec_destroy(&ctx->data.iov);
2474 g_free(ctx->data.bounce);
2475 g_free(ctx);
2476
2477 nvme_enqueue_req_completion(nvme_cq(req), req);
2478 }
2479
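/*
 * Dataset Management (Deallocate) is processed as a chain of discards, one
 * source range at a time; NvmeDSMAIOCB tracks progress across the ranges.
 * If the namespace has metadata, the metadata of ranges whose data blocks
 * read back as zeroes is zeroed as well.
 */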
2480 typedef struct NvmeDSMAIOCB {
2481 BlockAIOCB common;
2482 BlockAIOCB *aiocb;
2483 NvmeRequest *req;
2484 int ret;
2485
2486 NvmeDsmRange *range;
2487 unsigned int nr;
2488 unsigned int idx;
2489 } NvmeDSMAIOCB;
2490
2491 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2492 {
2493 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2494
2495 /* break nvme_dsm_cb loop */
2496 iocb->idx = iocb->nr;
2497 iocb->ret = -ECANCELED;
2498
2499 if (iocb->aiocb) {
2500 blk_aio_cancel_async(iocb->aiocb);
2501 iocb->aiocb = NULL;
2502 } else {
2503 /*
2504 * We only reach this if nvme_dsm_cancel() has already been called or
2505 * the command ran to completion.
2506 */
2507 assert(iocb->idx == iocb->nr);
2508 }
2509 }
2510
2511 static const AIOCBInfo nvme_dsm_aiocb_info = {
2512 .aiocb_size = sizeof(NvmeDSMAIOCB),
2513 .cancel_async = nvme_dsm_cancel,
2514 };
2515
2516 static void nvme_dsm_cb(void *opaque, int ret);
2517
2518 static void nvme_dsm_md_cb(void *opaque, int ret)
2519 {
2520 NvmeDSMAIOCB *iocb = opaque;
2521 NvmeRequest *req = iocb->req;
2522 NvmeNamespace *ns = req->ns;
2523 NvmeDsmRange *range;
2524 uint64_t slba;
2525 uint32_t nlb;
2526
2527 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2528 goto done;
2529 }
2530
2531 range = &iocb->range[iocb->idx - 1];
2532 slba = le64_to_cpu(range->slba);
2533 nlb = le32_to_cpu(range->nlb);
2534
2535 /*
2536      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2537 * the metadata.
2538 */
2539
2540 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2541 if (ret) {
2542 if (ret < 0) {
2543 goto done;
2544 }
2545
2546 nvme_dsm_cb(iocb, 0);
2547 return;
2548 }
2549
2550 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2551 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2552 nvme_dsm_cb, iocb);
2553 return;
2554
2555 done:
2556 nvme_dsm_cb(iocb, ret);
2557 }
2558
2559 static void nvme_dsm_cb(void *opaque, int ret)
2560 {
2561 NvmeDSMAIOCB *iocb = opaque;
2562 NvmeRequest *req = iocb->req;
2563 NvmeCtrl *n = nvme_ctrl(req);
2564 NvmeNamespace *ns = req->ns;
2565 NvmeDsmRange *range;
2566 uint64_t slba;
2567 uint32_t nlb;
2568
2569 if (iocb->ret < 0) {
2570 goto done;
2571 } else if (ret < 0) {
2572 iocb->ret = ret;
2573 goto done;
2574 }
2575
2576 next:
2577 if (iocb->idx == iocb->nr) {
2578 goto done;
2579 }
2580
2581 range = &iocb->range[iocb->idx++];
2582 slba = le64_to_cpu(range->slba);
2583 nlb = le32_to_cpu(range->nlb);
2584
2585 trace_pci_nvme_dsm_deallocate(slba, nlb);
2586
2587 if (nlb > n->dmrsl) {
2588 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2589 goto next;
2590 }
2591
2592 if (nvme_check_bounds(ns, slba, nlb)) {
2593 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2594 ns->id_ns.nsze);
2595 goto next;
2596 }
2597
2598 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2599 nvme_l2b(ns, nlb),
2600 nvme_dsm_md_cb, iocb);
2601 return;
2602
2603 done:
2604 iocb->aiocb = NULL;
2605 iocb->common.cb(iocb->common.opaque, iocb->ret);
2606 g_free(iocb->range);
2607 qemu_aio_unref(iocb);
2608 }
2609
2610 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2611 {
2612 NvmeNamespace *ns = req->ns;
2613 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2614 uint32_t attr = le32_to_cpu(dsm->attributes);
2615 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2616 uint16_t status = NVME_SUCCESS;
2617
2618 trace_pci_nvme_dsm(nr, attr);
2619
2620 if (attr & NVME_DSMGMT_AD) {
2621 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2622 nvme_misc_cb, req);
2623
2624 iocb->req = req;
2625 iocb->ret = 0;
2626 iocb->range = g_new(NvmeDsmRange, nr);
2627 iocb->nr = nr;
2628 iocb->idx = 0;
2629
2630 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2631 req);
2632 if (status) {
2633 g_free(iocb->range);
2634 qemu_aio_unref(iocb);
2635
2636 return status;
2637 }
2638
2639 req->aiocb = &iocb->common;
2640 nvme_dsm_cb(iocb, 0);
2641
2642 return NVME_NO_COMPLETE;
2643 }
2644
2645 return status;
2646 }
2647
2648 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2649 {
2650 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2651 NvmeNamespace *ns = req->ns;
2652 BlockBackend *blk = ns->blkconf.blk;
2653 uint64_t slba = le64_to_cpu(rw->slba);
2654 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2655 size_t len = nvme_l2b(ns, nlb);
2656 int64_t offset = nvme_l2b(ns, slba);
2657 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2658 uint32_t reftag = le32_to_cpu(rw->reftag);
2659 NvmeBounceContext *ctx = NULL;
2660 uint16_t status;
2661
2662 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2663
2664 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2665 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2666 if (status) {
2667 return status;
2668 }
2669
2670 if (prinfo & NVME_PRINFO_PRACT) {
2671 return NVME_INVALID_PROT_INFO | NVME_DNR;
2672 }
2673 }
2674
2675 if (len > n->page_size << n->params.vsl) {
2676 return NVME_INVALID_FIELD | NVME_DNR;
2677 }
2678
2679 status = nvme_check_bounds(ns, slba, nlb);
2680 if (status) {
2681 return status;
2682 }
2683
2684 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2685 status = nvme_check_dulbe(ns, slba, nlb);
2686 if (status) {
2687 return status;
2688 }
2689 }
2690
2691 ctx = g_new0(NvmeBounceContext, 1);
2692 ctx->req = req;
2693
2694 ctx->data.bounce = g_malloc(len);
2695
2696 qemu_iovec_init(&ctx->data.iov, 1);
2697 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2698
2699 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2700 BLOCK_ACCT_READ);
2701
2702 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2703 nvme_verify_mdata_in_cb, ctx);
2704 return NVME_NO_COMPLETE;
2705 }
2706
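/*
 * The Copy command is emulated with a bounce buffer: each source range is
 * read into the buffer (data and, if present, metadata), checked against
 * protection information and zone state, and then written out at the
 * destination LBA. NvmeCopyAIOCB carries the state across these
 * asynchronous steps.
 */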
2707 typedef struct NvmeCopyAIOCB {
2708 BlockAIOCB common;
2709 BlockAIOCB *aiocb;
2710 NvmeRequest *req;
2711 NvmeCtrl *n;
2712 int ret;
2713
2714 void *ranges;
2715 unsigned int format;
2716 int nr;
2717 int idx;
2718
2719 uint8_t *bounce;
2720 QEMUIOVector iov;
2721 struct {
2722 BlockAcctCookie read;
2723 BlockAcctCookie write;
2724 } acct;
2725
2726 uint64_t reftag;
2727 uint64_t slba;
2728
2729 NvmeZone *zone;
2730 NvmeNamespace *sns;
2731 uint32_t tcl;
2732 } NvmeCopyAIOCB;
2733
2734 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2735 {
2736 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2737
2738 iocb->ret = -ECANCELED;
2739
2740 if (iocb->aiocb) {
2741 blk_aio_cancel_async(iocb->aiocb);
2742 iocb->aiocb = NULL;
2743 }
2744 }
2745
2746 static const AIOCBInfo nvme_copy_aiocb_info = {
2747 .aiocb_size = sizeof(NvmeCopyAIOCB),
2748 .cancel_async = nvme_copy_cancel,
2749 };
2750
2751 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2752 {
2753 NvmeRequest *req = iocb->req;
2754 NvmeNamespace *ns = req->ns;
2755 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2756
2757 if (iocb->idx != iocb->nr) {
2758 req->cqe.result = cpu_to_le32(iocb->idx);
2759 }
2760
2761 qemu_iovec_destroy(&iocb->iov);
2762 g_free(iocb->bounce);
2763
2764 if (iocb->ret < 0) {
2765 block_acct_failed(stats, &iocb->acct.read);
2766 block_acct_failed(stats, &iocb->acct.write);
2767 } else {
2768 block_acct_done(stats, &iocb->acct.read);
2769 block_acct_done(stats, &iocb->acct.write);
2770 }
2771
2772 iocb->common.cb(iocb->common.opaque, iocb->ret);
2773 qemu_aio_unref(iocb);
2774 }
2775
2776 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2777
2778 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2779 int idx, uint64_t *slba,
2780 uint32_t *nlb,
2781 uint32_t *snsid,
2782 uint16_t *apptag,
2783 uint16_t *appmask,
2784 uint64_t *reftag)
2785 {
2786 NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2787
2788 if (snsid) {
2789 *snsid = le32_to_cpu(_ranges[idx].sparams);
2790 }
2791
2792 if (slba) {
2793 *slba = le64_to_cpu(_ranges[idx].slba);
2794 }
2795
2796 if (nlb) {
2797 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2798 }
2799
2800 if (apptag) {
2801 *apptag = le16_to_cpu(_ranges[idx].apptag);
2802 }
2803
2804 if (appmask) {
2805 *appmask = le16_to_cpu(_ranges[idx].appmask);
2806 }
2807
2808 if (reftag) {
2809 *reftag = le32_to_cpu(_ranges[idx].reftag);
2810 }
2811 }
2812
2813 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2814 uint64_t *slba,
2815 uint32_t *nlb,
2816 uint32_t *snsid,
2817 uint16_t *apptag,
2818 uint16_t *appmask,
2819 uint64_t *reftag)
2820 {
2821 NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2822
2823 if (snsid) {
2824 *snsid = le32_to_cpu(_ranges[idx].sparams);
2825 }
2826
2827 if (slba) {
2828 *slba = le64_to_cpu(_ranges[idx].slba);
2829 }
2830
2831 if (nlb) {
2832 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2833 }
2834
2835 if (apptag) {
2836 *apptag = le16_to_cpu(_ranges[idx].apptag);
2837 }
2838
2839 if (appmask) {
2840 *appmask = le16_to_cpu(_ranges[idx].appmask);
2841 }
2842
2843 if (reftag) {
2844 *reftag = 0;
2845
2846 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2847 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2848 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2849 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2850 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2851 *reftag |= (uint64_t)_ranges[idx].sr[9];
2852 }
2853 }
2854
2855 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2856 uint64_t *slba, uint32_t *nlb,
2857 uint32_t *snsid, uint16_t *apptag,
2858 uint16_t *appmask, uint64_t *reftag)
2859 {
2860 switch (format) {
2861 case NVME_COPY_FORMAT_0:
2862 case NVME_COPY_FORMAT_2:
2863 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2864 apptag, appmask, reftag);
2865 break;
2866
2867 case NVME_COPY_FORMAT_1:
2868 case NVME_COPY_FORMAT_3:
2869 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2870 apptag, appmask, reftag);
2871 break;
2872
2873 default:
2874 abort();
2875 }
2876 }
2877
2878 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2879 NvmeCopyAIOCB *iocb, uint16_t nr)
2880 {
2881 uint32_t copy_len = 0;
2882
2883 for (int idx = 0; idx < nr; idx++) {
2884 uint32_t nlb;
2885 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2886 &nlb, NULL, NULL, NULL, NULL);
2887 copy_len += nlb;
2888 }
2889 iocb->tcl = copy_len;
2890 if (copy_len > ns->id_ns.mcl) {
2891 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2892 }
2893
2894 return NVME_SUCCESS;
2895 }
2896
2897 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2898 {
2899 NvmeCopyAIOCB *iocb = opaque;
2900 NvmeRequest *req = iocb->req;
2901 NvmeNamespace *dns = req->ns;
2902 uint32_t nlb;
2903
2904 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2905 &nlb, NULL, NULL, NULL, NULL);
2906
2907 if (ret < 0) {
2908 iocb->ret = ret;
2909 goto out;
2910 } else if (iocb->ret < 0) {
2911 goto out;
2912 }
2913
2914 if (dns->params.zoned) {
2915 nvme_advance_zone_wp(dns, iocb->zone, nlb);
2916 }
2917
2918 iocb->idx++;
2919 iocb->slba += nlb;
2920 out:
2921 nvme_do_copy(iocb);
2922 }
2923
2924 static void nvme_copy_out_cb(void *opaque, int ret)
2925 {
2926 NvmeCopyAIOCB *iocb = opaque;
2927 NvmeRequest *req = iocb->req;
2928 NvmeNamespace *dns = req->ns;
2929 uint32_t nlb;
2930 size_t mlen;
2931 uint8_t *mbounce;
2932
2933 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2934 goto out;
2935 }
2936
2937 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2938 &nlb, NULL, NULL, NULL, NULL);
2939
2940 mlen = nvme_m2b(dns, nlb);
2941 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2942
2943 qemu_iovec_reset(&iocb->iov);
2944 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2945
2946 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2947 &iocb->iov, 0, nvme_copy_out_completed_cb,
2948 iocb);
2949
2950 return;
2951
2952 out:
2953 nvme_copy_out_completed_cb(iocb, ret);
2954 }
2955
2956 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2957 {
2958 NvmeCopyAIOCB *iocb = opaque;
2959 NvmeRequest *req = iocb->req;
2960 NvmeNamespace *sns = iocb->sns;
2961 NvmeNamespace *dns = req->ns;
2962 NvmeCopyCmd *copy = NULL;
2963 uint8_t *mbounce = NULL;
2964 uint32_t nlb;
2965 uint64_t slba;
2966 uint16_t apptag, appmask;
2967 uint64_t reftag;
2968 size_t len, mlen;
2969 uint16_t status;
2970
2971 if (ret < 0) {
2972 iocb->ret = ret;
2973 goto out;
2974 } else if (iocb->ret < 0) {
2975 goto out;
2976 }
2977
2978 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2979 &nlb, NULL, &apptag, &appmask, &reftag);
2980
2981 trace_pci_nvme_copy_out(iocb->slba, nlb);
2982
2983 len = nvme_l2b(sns, nlb);
2984
2985 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
2986 copy = (NvmeCopyCmd *)&req->cmd;
2987
2988 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2989
2990 mlen = nvme_m2b(sns, nlb);
2991 mbounce = iocb->bounce + nvme_l2b(sns, nlb);
2992
2993 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
2994 if (status) {
2995 goto invalid;
2996 }
2997 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
2998 slba, apptag, appmask, &reftag);
2999 if (status) {
3000 goto invalid;
3001 }
3002 }
3003
3004 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3005 copy = (NvmeCopyCmd *)&req->cmd;
3006 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3007
3008 mlen = nvme_m2b(dns, nlb);
3009 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3010
3011 apptag = le16_to_cpu(copy->apptag);
3012 appmask = le16_to_cpu(copy->appmask);
3013
3014 if (prinfow & NVME_PRINFO_PRACT) {
3015 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3016 if (status) {
3017 goto invalid;
3018 }
3019
3020 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3021 apptag, &iocb->reftag);
3022 } else {
3023 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3024 prinfow, iocb->slba, apptag, appmask,
3025 &iocb->reftag);
3026 if (status) {
3027 goto invalid;
3028 }
3029 }
3030 }
3031
3032 status = nvme_check_bounds(dns, iocb->slba, nlb);
3033 if (status) {
3034 goto invalid;
3035 }
3036
3037 if (dns->params.zoned) {
3038 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3039 if (status) {
3040 goto invalid;
3041 }
3042
3043 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3044 iocb->zone->w_ptr += nlb;
3045 }
3046 }
3047
3048 qemu_iovec_reset(&iocb->iov);
3049 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3050
3051 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3052 BLOCK_ACCT_WRITE);
3053
3054 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3055 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3056
3057 return;
3058
3059 invalid:
3060 req->status = status;
3061 iocb->ret = -1;
3062 out:
3063 nvme_do_copy(iocb);
3064 }
3065
3066 static void nvme_copy_in_cb(void *opaque, int ret)
3067 {
3068 NvmeCopyAIOCB *iocb = opaque;
3069 NvmeNamespace *sns = iocb->sns;
3070 uint64_t slba;
3071 uint32_t nlb;
3072
3073 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3074 goto out;
3075 }
3076
3077 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3078 &nlb, NULL, NULL, NULL, NULL);
3079
3080 qemu_iovec_reset(&iocb->iov);
3081 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3082 nvme_m2b(sns, nlb));
3083
3084 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3085 &iocb->iov, 0, nvme_copy_in_completed_cb,
3086 iocb);
3087 return;
3088
3089 out:
3090 nvme_copy_in_completed_cb(iocb, ret);
3091 }
3092
3093 static inline bool nvme_csi_supports_copy(uint8_t csi)
3094 {
3095 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3096 }
3097
3098 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3099 NvmeNamespace *dns)
3100 {
3101 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3102 }
3103
3104 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3105 bool pi_enable)
3106 {
3107 if (!nvme_csi_supports_copy(sns->csi) ||
3108 !nvme_csi_supports_copy(dns->csi)) {
3109 return false;
3110 }
3111
3112 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3113 return false;
3114 }
3115
3116 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3117 sns->id_ns.dps != dns->id_ns.dps)) {
3118 return false;
3119 }
3120
3121 return true;
3122 }
3123
3124 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3125 NvmeNamespace *dns)
3126 {
3127 return sns->lbaf.ms == 0 &&
3128 ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3129 (dns->lbaf.ms == 16 && dns->pif == 1));
3130 }
3131
3132 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3133 bool sns_pi_en)
3134 {
3135 if (!nvme_csi_supports_copy(sns->csi) ||
3136 !nvme_csi_supports_copy(dns->csi)) {
3137 return false;
3138 }
3139
3140 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3141 return false;
3142 }
3143
3144 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3145 return false;
3146 }
3147
3148 return true;
3149 }
3150
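/*
 * Process the next source range of a Copy command: resolve the source
 * namespace (formats 2 and 3 only), validate format and protection
 * information compatibility between source and destination, check bounds
 * and zone state, then read the range into the bounce buffer.
 */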
3151 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3152 {
3153 NvmeRequest *req = iocb->req;
3154 NvmeNamespace *sns;
3155 NvmeNamespace *dns = req->ns;
3156 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3157 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3158 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3159 uint64_t slba;
3160 uint32_t nlb;
3161 size_t len;
3162 uint16_t status;
3163 uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3164 uint32_t snsid = dnsid;
3165
3166 if (iocb->ret < 0) {
3167 goto done;
3168 }
3169
3170 if (iocb->idx == iocb->nr) {
3171 goto done;
3172 }
3173
3174 if (iocb->format == 2 || iocb->format == 3) {
3175 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3176 &slba, &nlb, &snsid, NULL, NULL, NULL);
3177 if (snsid != dnsid) {
3178 if (snsid == NVME_NSID_BROADCAST ||
3179 !nvme_nsid_valid(iocb->n, snsid)) {
3180 status = NVME_INVALID_NSID | NVME_DNR;
3181 goto invalid;
3182 }
3183 iocb->sns = nvme_ns(iocb->n, snsid);
3184 if (unlikely(!iocb->sns)) {
3185 status = NVME_INVALID_FIELD | NVME_DNR;
3186 goto invalid;
3187 }
3188 } else {
3189 if (((slba + nlb) > iocb->slba) &&
3190 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3191 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3192 goto invalid;
3193 }
3194 }
3195 } else {
3196 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3197 &slba, &nlb, NULL, NULL, NULL, NULL);
3198 }
3199
3200 sns = iocb->sns;
3201 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3202 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3203 status = NVME_INVALID_FIELD | NVME_DNR;
3204 goto invalid;
3205 } else if (snsid != dnsid) {
3206 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3207 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3208 if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3209 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3210 goto invalid;
3211 }
3212 }
3213 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3214 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3215 if ((prinfor & NVME_PRINFO_PRACT) !=
3216 (prinfow & NVME_PRINFO_PRACT)) {
3217 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3218 goto invalid;
3219 } else {
3220 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3221 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3222 goto invalid;
3223 }
3224 }
3225 }
3226
3227 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3228 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3229 if (!(prinfow & NVME_PRINFO_PRACT)) {
3230 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3231 goto invalid;
3232 } else {
3233 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3234 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3235 goto invalid;
3236 }
3237 }
3238 }
3239
3240 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3241 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3242 if (!(prinfor & NVME_PRINFO_PRACT)) {
3243 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3244 goto invalid;
3245 } else {
3246 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3247 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3248 goto invalid;
3249 }
3250 }
3251 }
3252 }
3253 len = nvme_l2b(sns, nlb);
3254
3255 trace_pci_nvme_copy_source_range(slba, nlb);
3256
3257 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3258 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3259 goto invalid;
3260 }
3261
3262 status = nvme_check_bounds(sns, slba, nlb);
3263 if (status) {
3264 goto invalid;
3265 }
3266
3267 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3268 status = nvme_check_dulbe(sns, slba, nlb);
3269 if (status) {
3270 goto invalid;
3271 }
3272 }
3273
3274 if (sns->params.zoned) {
3275 status = nvme_check_zone_read(sns, slba, nlb);
3276 if (status) {
3277 goto invalid;
3278 }
3279 }
3280
3281 g_free(iocb->bounce);
3282 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3283 sns->lbasz + sns->lbaf.ms);
3284
3285 qemu_iovec_reset(&iocb->iov);
3286 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3287
3288 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3289 BLOCK_ACCT_READ);
3290
3291 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3292 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3293 return;
3294
3295 invalid:
3296 req->status = status;
3297 iocb->ret = -1;
3298 done:
3299 nvme_copy_done(iocb);
3300 }
3301
3302 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3303 {
3304 NvmeNamespace *ns = req->ns;
3305 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3306 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3307 nvme_misc_cb, req);
3308 uint16_t nr = copy->nr + 1;
3309 uint8_t format = copy->control[0] & 0xf;
3310 size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3311
3312 uint16_t status;
3313
3314 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3315
3316 iocb->ranges = NULL;
3317 iocb->zone = NULL;
3318
3319 if (!(n->id_ctrl.ocfs & (1 << format)) ||
3320 ((format == 2 || format == 3) &&
3321 !(n->features.hbs.cdfe & (1 << format)))) {
3322 trace_pci_nvme_err_copy_invalid_format(format);
3323 status = NVME_INVALID_FIELD | NVME_DNR;
3324 goto invalid;
3325 }
3326
3327 if (nr > ns->id_ns.msrc + 1) {
3328 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3329 goto invalid;
3330 }
3331
3332 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3333 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3334 status = NVME_INVALID_FORMAT | NVME_DNR;
3335 goto invalid;
3336 }
3337
3338 if (ns->pif) {
3339 len = sizeof(NvmeCopySourceRangeFormat1_3);
3340 }
3341
3342 iocb->format = format;
3343 iocb->ranges = g_malloc_n(nr, len);
3344 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3345 if (status) {
3346 goto invalid;
3347 }
3348
3349 iocb->slba = le64_to_cpu(copy->sdlba);
3350
3351 if (ns->params.zoned) {
3352 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3353 if (!iocb->zone) {
3354 status = NVME_LBA_RANGE | NVME_DNR;
3355 goto invalid;
3356 }
3357
3358 status = nvme_zrm_auto(n, ns, iocb->zone);
3359 if (status) {
3360 goto invalid;
3361 }
3362 }
3363
3364 status = nvme_check_copy_mcl(ns, iocb, nr);
3365 if (status) {
3366 goto invalid;
3367 }
3368
3369 iocb->req = req;
3370 iocb->ret = 0;
3371 iocb->nr = nr;
3372 iocb->idx = 0;
3373 iocb->reftag = le32_to_cpu(copy->reftag);
3374 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3375
3376 qemu_iovec_init(&iocb->iov, 1);
3377
3378 req->aiocb = &iocb->common;
3379 iocb->sns = req->ns;
3380 iocb->n = n;
3381 iocb->bounce = NULL;
3382 nvme_do_copy(iocb);
3383
3384 return NVME_NO_COMPLETE;
3385
3386 invalid:
3387 g_free(iocb->ranges);
3388 qemu_aio_unref(iocb);
3389 return status;
3390 }
3391
3392 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3393 {
3394 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3395 NvmeNamespace *ns = req->ns;
3396 BlockBackend *blk = ns->blkconf.blk;
3397 uint64_t slba = le64_to_cpu(rw->slba);
3398 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3399 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3400 size_t data_len = nvme_l2b(ns, nlb);
3401 size_t len = data_len;
3402 int64_t offset = nvme_l2b(ns, slba);
3403 struct nvme_compare_ctx *ctx = NULL;
3404 uint16_t status;
3405
3406 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3407
3408 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3409 return NVME_INVALID_PROT_INFO | NVME_DNR;
3410 }
3411
3412 if (nvme_ns_ext(ns)) {
3413 len += nvme_m2b(ns, nlb);
3414 }
3415
3416 status = nvme_check_mdts(n, len);
3417 if (status) {
3418 return status;
3419 }
3420
3421 status = nvme_check_bounds(ns, slba, nlb);
3422 if (status) {
3423 return status;
3424 }
3425
3426 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3427 status = nvme_check_dulbe(ns, slba, nlb);
3428 if (status) {
3429 return status;
3430 }
3431 }
3432
3433 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3434 if (status) {
3435 return status;
3436 }
3437
3438 ctx = g_new(struct nvme_compare_ctx, 1);
3439 ctx->data.bounce = g_malloc(data_len);
3440
3441 req->opaque = ctx;
3442
3443 qemu_iovec_init(&ctx->data.iov, 1);
3444 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3445
3446 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3447 BLOCK_ACCT_READ);
3448 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3449 nvme_compare_data_cb, req);
3450
3451 return NVME_NO_COMPLETE;
3452 }
3453
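/*
 * Flush is handled asynchronously; for the broadcast NSID (FFFFFFFFh) the
 * controller iterates over all attached namespaces and flushes each backing
 * block device in turn.
 */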
3454 typedef struct NvmeFlushAIOCB {
3455 BlockAIOCB common;
3456 BlockAIOCB *aiocb;
3457 NvmeRequest *req;
3458 int ret;
3459
3460 NvmeNamespace *ns;
3461 uint32_t nsid;
3462 bool broadcast;
3463 } NvmeFlushAIOCB;
3464
3465 static void nvme_flush_cancel(BlockAIOCB *acb)
3466 {
3467 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3468
3469 iocb->ret = -ECANCELED;
3470
3471 if (iocb->aiocb) {
3472 blk_aio_cancel_async(iocb->aiocb);
3473 iocb->aiocb = NULL;
3474 }
3475 }
3476
3477 static const AIOCBInfo nvme_flush_aiocb_info = {
3478 .aiocb_size = sizeof(NvmeFlushAIOCB),
3479 .cancel_async = nvme_flush_cancel,
3480 };
3481
3482 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3483
3484 static void nvme_flush_ns_cb(void *opaque, int ret)
3485 {
3486 NvmeFlushAIOCB *iocb = opaque;
3487 NvmeNamespace *ns = iocb->ns;
3488
3489 if (ret < 0) {
3490 iocb->ret = ret;
3491 goto out;
3492 } else if (iocb->ret < 0) {
3493 goto out;
3494 }
3495
3496 if (ns) {
3497 trace_pci_nvme_flush_ns(iocb->nsid);
3498
3499 iocb->ns = NULL;
3500 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3501 return;
3502 }
3503
3504 out:
3505 nvme_do_flush(iocb);
3506 }
3507
3508 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3509 {
3510 NvmeRequest *req = iocb->req;
3511 NvmeCtrl *n = nvme_ctrl(req);
3512 int i;
3513
3514 if (iocb->ret < 0) {
3515 goto done;
3516 }
3517
3518 if (iocb->broadcast) {
3519 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3520 iocb->ns = nvme_ns(n, i);
3521 if (iocb->ns) {
3522 iocb->nsid = i;
3523 break;
3524 }
3525 }
3526 }
3527
3528 if (!iocb->ns) {
3529 goto done;
3530 }
3531
3532 nvme_flush_ns_cb(iocb, 0);
3533 return;
3534
3535 done:
3536 iocb->common.cb(iocb->common.opaque, iocb->ret);
3537 qemu_aio_unref(iocb);
3538 }
3539
3540 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3541 {
3542 NvmeFlushAIOCB *iocb;
3543 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3544 uint16_t status;
3545
3546 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3547
3548 iocb->req = req;
3549 iocb->ret = 0;
3550 iocb->ns = NULL;
3551 iocb->nsid = 0;
3552 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3553
3554 if (!iocb->broadcast) {
3555 if (!nvme_nsid_valid(n, nsid)) {
3556 status = NVME_INVALID_NSID | NVME_DNR;
3557 goto out;
3558 }
3559
3560 iocb->ns = nvme_ns(n, nsid);
3561 if (!iocb->ns) {
3562 status = NVME_INVALID_FIELD | NVME_DNR;
3563 goto out;
3564 }
3565
3566 iocb->nsid = nsid;
3567 }
3568
3569 req->aiocb = &iocb->common;
3570 nvme_do_flush(iocb);
3571
3572 return NVME_NO_COMPLETE;
3573
3574 out:
3575 qemu_aio_unref(iocb);
3576
3577 return status;
3578 }
3579
3580 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3581 {
3582 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3583 NvmeNamespace *ns = req->ns;
3584 uint64_t slba = le64_to_cpu(rw->slba);
3585 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3586 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3587 uint64_t data_size = nvme_l2b(ns, nlb);
3588 uint64_t mapped_size = data_size;
3589 uint64_t data_offset;
3590 BlockBackend *blk = ns->blkconf.blk;
3591 uint16_t status;
3592
3593 if (nvme_ns_ext(ns)) {
3594 mapped_size += nvme_m2b(ns, nlb);
3595
3596 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3597 bool pract = prinfo & NVME_PRINFO_PRACT;
3598
3599 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3600 mapped_size = data_size;
3601 }
3602 }
3603 }
3604
3605 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3606
3607 status = nvme_check_mdts(n, mapped_size);
3608 if (status) {
3609 goto invalid;
3610 }
3611
3612 status = nvme_check_bounds(ns, slba, nlb);
3613 if (status) {
3614 goto invalid;
3615 }
3616
3617 if (ns->params.zoned) {
3618 status = nvme_check_zone_read(ns, slba, nlb);
3619 if (status) {
3620 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3621 goto invalid;
3622 }
3623 }
3624
3625 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3626 status = nvme_check_dulbe(ns, slba, nlb);
3627 if (status) {
3628 goto invalid;
3629 }
3630 }
3631
3632 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3633 return nvme_dif_rw(n, req);
3634 }
3635
3636 status = nvme_map_data(n, nlb, req);
3637 if (status) {
3638 goto invalid;
3639 }
3640
3641 data_offset = nvme_l2b(ns, slba);
3642
3643 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3644 BLOCK_ACCT_READ);
3645 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3646 return NVME_NO_COMPLETE;
3647
3648 invalid:
3649 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3650 return status | NVME_DNR;
3651 }
3652
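/*
 * Flexible Data Placement (FDP) write accounting: resolve the placement
 * handle from the directive specific field, update the host/media bytes
 * written statistics and consume capacity from the targeted reclaim unit,
 * switching to a new reclaim unit whenever the current one is exhausted.
 */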
3653 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3654 uint32_t nlb)
3655 {
3656 NvmeNamespace *ns = req->ns;
3657 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3658 uint64_t data_size = nvme_l2b(ns, nlb);
3659 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3660 uint8_t dtype = (dw12 >> 20) & 0xf;
3661 uint16_t pid = le16_to_cpu(rw->dspec);
3662 uint16_t ph, rg, ruhid;
3663 NvmeReclaimUnit *ru;
3664
3665 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3666 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3667 ph = 0;
3668 rg = 0;
3669 }
3670
3671 ruhid = ns->fdp.phs[ph];
3672 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3673
3674 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3675 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3676
3677 while (nlb) {
3678 if (nlb < ru->ruamw) {
3679 ru->ruamw -= nlb;
3680 break;
3681 }
3682
3683 nlb -= ru->ruamw;
3684 nvme_update_ruh(n, ns, pid);
3685 }
3686 }
3687
3688 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3689 bool wrz)
3690 {
3691 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3692 NvmeNamespace *ns = req->ns;
3693 uint64_t slba = le64_to_cpu(rw->slba);
3694 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3695 uint16_t ctrl = le16_to_cpu(rw->control);
3696 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3697 uint64_t data_size = nvme_l2b(ns, nlb);
3698 uint64_t mapped_size = data_size;
3699 uint64_t data_offset;
3700 NvmeZone *zone;
3701 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3702 BlockBackend *blk = ns->blkconf.blk;
3703 uint16_t status;
3704
3705 if (nvme_ns_ext(ns)) {
3706 mapped_size += nvme_m2b(ns, nlb);
3707
3708 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3709 bool pract = prinfo & NVME_PRINFO_PRACT;
3710
3711 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3712 mapped_size -= nvme_m2b(ns, nlb);
3713 }
3714 }
3715 }
3716
3717 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3718 nvme_nsid(ns), nlb, mapped_size, slba);
3719
3720 if (!wrz) {
3721 status = nvme_check_mdts(n, mapped_size);
3722 if (status) {
3723 goto invalid;
3724 }
3725 }
3726
3727 status = nvme_check_bounds(ns, slba, nlb);
3728 if (status) {
3729 goto invalid;
3730 }
3731
3732 if (ns->params.zoned) {
3733 zone = nvme_get_zone_by_slba(ns, slba);
3734 assert(zone);
3735
3736 if (append) {
3737 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3738
3739 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3740 return NVME_INVALID_ZONE_OP | NVME_DNR;
3741 }
3742
3743 if (unlikely(slba != zone->d.zslba)) {
3744 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3745 status = NVME_INVALID_FIELD;
3746 goto invalid;
3747 }
3748
3749 if (n->params.zasl &&
3750 data_size > (uint64_t)n->page_size << n->params.zasl) {
3751 trace_pci_nvme_err_zasl(data_size);
3752 return NVME_INVALID_FIELD | NVME_DNR;
3753 }
3754
3755 slba = zone->w_ptr;
3756 rw->slba = cpu_to_le64(slba);
3757 res->slba = cpu_to_le64(slba);
3758
3759 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3760 case NVME_ID_NS_DPS_TYPE_1:
3761 if (!piremap) {
3762 return NVME_INVALID_PROT_INFO | NVME_DNR;
3763 }
3764
3765 /* fallthrough */
3766
3767 case NVME_ID_NS_DPS_TYPE_2:
3768 if (piremap) {
3769 uint32_t reftag = le32_to_cpu(rw->reftag);
3770 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3771 }
3772
3773 break;
3774
3775 case NVME_ID_NS_DPS_TYPE_3:
3776 if (piremap) {
3777 return NVME_INVALID_PROT_INFO | NVME_DNR;
3778 }
3779
3780 break;
3781 }
3782 }
3783
3784 status = nvme_check_zone_write(ns, zone, slba, nlb);
3785 if (status) {
3786 goto invalid;
3787 }
3788
3789 status = nvme_zrm_auto(n, ns, zone);
3790 if (status) {
3791 goto invalid;
3792 }
3793
3794 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3795 zone->w_ptr += nlb;
3796 }
3797 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3798 nvme_do_write_fdp(n, req, slba, nlb);
3799 }
3800
3801 data_offset = nvme_l2b(ns, slba);
3802
3803 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3804 return nvme_dif_rw(n, req);
3805 }
3806
3807 if (!wrz) {
3808 status = nvme_map_data(n, nlb, req);
3809 if (status) {
3810 goto invalid;
3811 }
3812
3813 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3814 BLOCK_ACCT_WRITE);
3815 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3816 } else {
3817 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3818 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3819 req);
3820 }
3821
3822 return NVME_NO_COMPLETE;
3823
3824 invalid:
3825 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3826 return status | NVME_DNR;
3827 }
3828
3829 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3830 {
3831 return nvme_do_write(n, req, false, false);
3832 }
3833
3834 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3835 {
3836 return nvme_do_write(n, req, false, true);
3837 }
3838
3839 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3840 {
3841 return nvme_do_write(n, req, true, false);
3842 }
3843
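/*
 * Decode the starting LBA (CDW10/CDW11) of a zone management command and
 * translate it into a zone index, rejecting LBAs beyond the namespace size.
 */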
3844 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3845 uint64_t *slba, uint32_t *zone_idx)
3846 {
3847 uint32_t dw10 = le32_to_cpu(c->cdw10);
3848 uint32_t dw11 = le32_to_cpu(c->cdw11);
3849
3850 if (!ns->params.zoned) {
3851 trace_pci_nvme_err_invalid_opc(c->opcode);
3852 return NVME_INVALID_OPCODE | NVME_DNR;
3853 }
3854
3855 *slba = ((uint64_t)dw11) << 32 | dw10;
3856 if (unlikely(*slba >= ns->id_ns.nsze)) {
3857 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3858 *slba = 0;
3859 return NVME_LBA_RANGE | NVME_DNR;
3860 }
3861
3862 *zone_idx = nvme_zone_idx(ns, *slba);
3863 assert(*zone_idx < ns->num_zones);
3864
3865 return NVME_SUCCESS;
3866 }
3867
3868 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3869 NvmeRequest *);
3870
3871 enum NvmeZoneProcessingMask {
3872 NVME_PROC_CURRENT_ZONE = 0,
3873 NVME_PROC_OPENED_ZONES = 1 << 0,
3874 NVME_PROC_CLOSED_ZONES = 1 << 1,
3875 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3876 NVME_PROC_FULL_ZONES = 1 << 3,
3877 };
3878
3879 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3880 NvmeZoneState state, NvmeRequest *req)
3881 {
3882 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3883 int flags = 0;
3884
3885 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3886 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3887
3888 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3889 return NVME_INVALID_ZONE_OP | NVME_DNR;
3890 }
3891
3892 if (zone->w_ptr % ns->zns.zrwafg) {
3893 return NVME_NOZRWA | NVME_DNR;
3894 }
3895
3896 flags = NVME_ZRM_ZRWA;
3897 }
3898
3899 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3900 }
3901
3902 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3903 NvmeZoneState state, NvmeRequest *req)
3904 {
3905 return nvme_zrm_close(ns, zone);
3906 }
3907
3908 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3909 NvmeZoneState state, NvmeRequest *req)
3910 {
3911 return nvme_zrm_finish(ns, zone);
3912 }
3913
3914 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3915 NvmeZoneState state, NvmeRequest *req)
3916 {
3917 switch (state) {
3918 case NVME_ZONE_STATE_READ_ONLY:
3919 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3920 /* fall through */
3921 case NVME_ZONE_STATE_OFFLINE:
3922 return NVME_SUCCESS;
3923 default:
3924 return NVME_ZONE_INVAL_TRANSITION;
3925 }
3926 }
3927
3928 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3929 {
3930 uint16_t status;
3931 uint8_t state = nvme_get_zone_state(zone);
3932
3933 if (state == NVME_ZONE_STATE_EMPTY) {
3934 status = nvme_aor_check(ns, 1, 0);
3935 if (status) {
3936 return status;
3937 }
3938 nvme_aor_inc_active(ns);
3939 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3940 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3941 return NVME_SUCCESS;
3942 }
3943
3944 return NVME_ZONE_INVAL_TRANSITION;
3945 }
3946
3947 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3948 enum NvmeZoneProcessingMask proc_mask,
3949 op_handler_t op_hndlr, NvmeRequest *req)
3950 {
3951 uint16_t status = NVME_SUCCESS;
3952 NvmeZoneState zs = nvme_get_zone_state(zone);
3953 bool proc_zone;
3954
3955 switch (zs) {
3956 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3957 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3958 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3959 break;
3960 case NVME_ZONE_STATE_CLOSED:
3961 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3962 break;
3963 case NVME_ZONE_STATE_READ_ONLY:
3964 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3965 break;
3966 case NVME_ZONE_STATE_FULL:
3967 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3968 break;
3969 default:
3970 proc_zone = false;
3971 }
3972
3973 if (proc_zone) {
3974 status = op_hndlr(ns, zone, zs, req);
3975 }
3976
3977 return status;
3978 }
3979
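/*
 * Applies op_hndlr either to the single zone passed in (no mask set) or, for
 * "Select All" operations, to every zone on the per-state lists selected by
 * proc_mask. Read-only zones are found by scanning the zone array, since they
 * are not kept on a dedicated list.
 */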
3980 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3981 enum NvmeZoneProcessingMask proc_mask,
3982 op_handler_t op_hndlr, NvmeRequest *req)
3983 {
3984 NvmeZone *next;
3985 uint16_t status = NVME_SUCCESS;
3986 int i;
3987
3988 if (!proc_mask) {
3989 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3990 } else {
3991 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3992 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3993 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3994 req);
3995 if (status && status != NVME_NO_COMPLETE) {
3996 goto out;
3997 }
3998 }
3999 }
4000 if (proc_mask & NVME_PROC_OPENED_ZONES) {
4001 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4002 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4003 req);
4004 if (status && status != NVME_NO_COMPLETE) {
4005 goto out;
4006 }
4007 }
4008
4009 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4010 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4011 req);
4012 if (status && status != NVME_NO_COMPLETE) {
4013 goto out;
4014 }
4015 }
4016 }
4017 if (proc_mask & NVME_PROC_FULL_ZONES) {
4018 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4019 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4020 req);
4021 if (status && status != NVME_NO_COMPLETE) {
4022 goto out;
4023 }
4024 }
4025 }
4026
4027 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4028 for (i = 0; i < ns->num_zones; i++, zone++) {
4029 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4030 req);
4031 if (status && status != NVME_NO_COMPLETE) {
4032 goto out;
4033 }
4034 }
4035 }
4036 }
4037
4038 out:
4039 return status;
4040 }
4041
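/*
 * Zone Reset is handled as an asynchronous state machine: nvme_zone_reset_cb
 * and nvme_zone_reset_epilogue_cb re-arm each other via blk_aio_pwrite_zeroes
 * until the selected zone (or, for "Select All", every resettable zone) has
 * had its data and, when the namespace carries metadata, its metadata zeroed.
 */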
4042 typedef struct NvmeZoneResetAIOCB {
4043 BlockAIOCB common;
4044 BlockAIOCB *aiocb;
4045 NvmeRequest *req;
4046 int ret;
4047
4048 bool all;
4049 int idx;
4050 NvmeZone *zone;
4051 } NvmeZoneResetAIOCB;
4052
4053 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4054 {
4055 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4056 NvmeRequest *req = iocb->req;
4057 NvmeNamespace *ns = req->ns;
4058
4059 iocb->idx = ns->num_zones;
4060
4061 iocb->ret = -ECANCELED;
4062
4063 if (iocb->aiocb) {
4064 blk_aio_cancel_async(iocb->aiocb);
4065 iocb->aiocb = NULL;
4066 }
4067 }
4068
4069 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4070 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4071 .cancel_async = nvme_zone_reset_cancel,
4072 };
4073
4074 static void nvme_zone_reset_cb(void *opaque, int ret);
4075
4076 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4077 {
4078 NvmeZoneResetAIOCB *iocb = opaque;
4079 NvmeRequest *req = iocb->req;
4080 NvmeNamespace *ns = req->ns;
4081 int64_t moff;
4082 int count;
4083
4084 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4085 goto out;
4086 }
4087
4088 moff = nvme_moff(ns, iocb->zone->d.zslba);
4089 count = nvme_m2b(ns, ns->zone_size);
4090
4091 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4092 BDRV_REQ_MAY_UNMAP,
4093 nvme_zone_reset_cb, iocb);
4094 return;
4095
4096 out:
4097 nvme_zone_reset_cb(iocb, ret);
4098 }
4099
4100 static void nvme_zone_reset_cb(void *opaque, int ret)
4101 {
4102 NvmeZoneResetAIOCB *iocb = opaque;
4103 NvmeRequest *req = iocb->req;
4104 NvmeNamespace *ns = req->ns;
4105
4106 if (iocb->ret < 0) {
4107 goto done;
4108 } else if (ret < 0) {
4109 iocb->ret = ret;
4110 goto done;
4111 }
4112
4113 if (iocb->zone) {
4114 nvme_zrm_reset(ns, iocb->zone);
4115
4116 if (!iocb->all) {
4117 goto done;
4118 }
4119 }
4120
4121 while (iocb->idx < ns->num_zones) {
4122 NvmeZone *zone = &ns->zone_array[iocb->idx++];
4123
4124 switch (nvme_get_zone_state(zone)) {
4125 case NVME_ZONE_STATE_EMPTY:
4126 if (!iocb->all) {
4127 goto done;
4128 }
4129
4130 continue;
4131
4132 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4133 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4134 case NVME_ZONE_STATE_CLOSED:
4135 case NVME_ZONE_STATE_FULL:
4136 iocb->zone = zone;
4137 break;
4138
4139 default:
4140 continue;
4141 }
4142
4143 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4144
4145 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4146 nvme_l2b(ns, zone->d.zslba),
4147 nvme_l2b(ns, ns->zone_size),
4148 BDRV_REQ_MAY_UNMAP,
4149 nvme_zone_reset_epilogue_cb,
4150 iocb);
4151 return;
4152 }
4153
4154 done:
4155 iocb->aiocb = NULL;
4156
4157 iocb->common.cb(iocb->common.opaque, iocb->ret);
4158 qemu_aio_unref(iocb);
4159 }
4160
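/*
 * Explicit ZRWA flush: checks that the commit point lies within the zone's
 * random write area and is aligned to the ZRWA flush granularity, then
 * commits by advancing the write pointer up to and including elba.
 */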
4161 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4162 uint64_t elba, NvmeRequest *req)
4163 {
4164 NvmeNamespace *ns = req->ns;
4165 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4166 uint64_t wp = zone->d.wp;
4167 uint32_t nlb = elba - wp + 1;
4168 uint16_t status;
4169
4170
4171 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4172 return NVME_INVALID_ZONE_OP | NVME_DNR;
4173 }
4174
4175 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4176 return NVME_INVALID_FIELD | NVME_DNR;
4177 }
4178
4179 if (elba < wp || elba > wp + ns->zns.zrwas) {
4180 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4181 }
4182
4183 if (nlb % ns->zns.zrwafg) {
4184 return NVME_INVALID_FIELD | NVME_DNR;
4185 }
4186
4187 status = nvme_zrm_auto(n, ns, zone);
4188 if (status) {
4189 return status;
4190 }
4191
4192 zone->w_ptr += nlb;
4193
4194 nvme_advance_zone_wp(ns, zone, nlb);
4195
4196 return NVME_SUCCESS;
4197 }
4198
4199 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4200 {
4201 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4202 NvmeNamespace *ns = req->ns;
4203 NvmeZone *zone;
4204 NvmeZoneResetAIOCB *iocb;
4205 uint8_t *zd_ext;
4206 uint64_t slba = 0;
4207 uint32_t zone_idx = 0;
4208 uint16_t status;
4209 uint8_t action = cmd->zsa;
4210 bool all;
4211 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4212
4213 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4214
4215 req->status = NVME_SUCCESS;
4216
4217 if (!all) {
4218 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4219 if (status) {
4220 return status;
4221 }
4222 }
4223
4224 zone = &ns->zone_array[zone_idx];
4225 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4226 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4227 return NVME_INVALID_FIELD | NVME_DNR;
4228 }
4229
4230 switch (action) {
4231
4232 case NVME_ZONE_ACTION_OPEN:
4233 if (all) {
4234 proc_mask = NVME_PROC_CLOSED_ZONES;
4235 }
4236 trace_pci_nvme_open_zone(slba, zone_idx, all);
4237 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4238 break;
4239
4240 case NVME_ZONE_ACTION_CLOSE:
4241 if (all) {
4242 proc_mask = NVME_PROC_OPENED_ZONES;
4243 }
4244 trace_pci_nvme_close_zone(slba, zone_idx, all);
4245 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4246 break;
4247
4248 case NVME_ZONE_ACTION_FINISH:
4249 if (all) {
4250 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4251 }
4252 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4253 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4254 break;
4255
4256 case NVME_ZONE_ACTION_RESET:
4257 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4258
4259 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4260 nvme_misc_cb, req);
4261
4262 iocb->req = req;
4263 iocb->ret = 0;
4264 iocb->all = all;
4265 iocb->idx = zone_idx;
4266 iocb->zone = NULL;
4267
4268 req->aiocb = &iocb->common;
4269 nvme_zone_reset_cb(iocb, 0);
4270
4271 return NVME_NO_COMPLETE;
4272
4273 case NVME_ZONE_ACTION_OFFLINE:
4274 if (all) {
4275 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4276 }
4277 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4278 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4279 break;
4280
4281 case NVME_ZONE_ACTION_SET_ZD_EXT:
4282 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4283 if (all || !ns->params.zd_extension_size) {
4284 return NVME_INVALID_FIELD | NVME_DNR;
4285 }
4286 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4287 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4288 if (status) {
4289 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4290 return status;
4291 }
4292
4293 status = nvme_set_zd_ext(ns, zone);
4294 if (status == NVME_SUCCESS) {
4295 trace_pci_nvme_zd_extension_set(zone_idx);
4296 return status;
4297 }
4298 break;
4299
4300 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4301 if (all) {
4302 return NVME_INVALID_FIELD | NVME_DNR;
4303 }
4304
4305 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4306
4307 default:
4308 trace_pci_nvme_err_invalid_mgmt_action(action);
4309 status = NVME_INVALID_FIELD;
4310 }
4311
4312 if (status == NVME_ZONE_INVAL_TRANSITION) {
4313 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4314 zone->d.za);
4315 }
4316 if (status) {
4317 status |= NVME_DNR;
4318 }
4319
4320 return status;
4321 }
4322
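/*
 * Zone Management Receive: match a zone against the Zone Receive Action
 * Specific Field (report all zones, or only zones in a given state).
 */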
4323 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4324 {
4325 NvmeZoneState zs = nvme_get_zone_state(zl);
4326
4327 switch (zafs) {
4328 case NVME_ZONE_REPORT_ALL:
4329 return true;
4330 case NVME_ZONE_REPORT_EMPTY:
4331 return zs == NVME_ZONE_STATE_EMPTY;
4332 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4333 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4334 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4335 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4336 case NVME_ZONE_REPORT_CLOSED:
4337 return zs == NVME_ZONE_STATE_CLOSED;
4338 case NVME_ZONE_REPORT_FULL:
4339 return zs == NVME_ZONE_STATE_FULL;
4340 case NVME_ZONE_REPORT_READ_ONLY:
4341 return zs == NVME_ZONE_STATE_READ_ONLY;
4342 case NVME_ZONE_REPORT_OFFLINE:
4343 return zs == NVME_ZONE_STATE_OFFLINE;
4344 default:
4345 return false;
4346 }
4347 }
4348
4349 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4350 {
4351 NvmeCmd *cmd = &req->cmd;
4352 NvmeNamespace *ns = req->ns;
4353 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4354 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4355 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4356 uint32_t zone_idx, zra, zrasf, partial;
4357 uint64_t max_zones, nr_zones = 0;
4358 uint16_t status;
4359 uint64_t slba;
4360 NvmeZoneDescr *z;
4361 NvmeZone *zone;
4362 NvmeZoneReportHeader *header;
4363 void *buf, *buf_p;
4364 size_t zone_entry_sz;
4365 int i;
4366
4367 req->status = NVME_SUCCESS;
4368
4369 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4370 if (status) {
4371 return status;
4372 }
4373
4374 zra = dw13 & 0xff;
4375 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4376 return NVME_INVALID_FIELD | NVME_DNR;
4377 }
4378 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4379 return NVME_INVALID_FIELD | NVME_DNR;
4380 }
4381
4382 zrasf = (dw13 >> 8) & 0xff;
4383 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4384 return NVME_INVALID_FIELD | NVME_DNR;
4385 }
4386
4387 if (data_size < sizeof(NvmeZoneReportHeader)) {
4388 return NVME_INVALID_FIELD | NVME_DNR;
4389 }
4390
4391 status = nvme_check_mdts(n, data_size);
4392 if (status) {
4393 return status;
4394 }
4395
4396 partial = (dw13 >> 16) & 0x01;
4397
4398 zone_entry_sz = sizeof(NvmeZoneDescr);
4399 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4400 zone_entry_sz += ns->params.zd_extension_size;
4401 }
4402
4403 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4404 buf = g_malloc0(data_size);
4405
4406 zone = &ns->zone_array[zone_idx];
4407 for (i = zone_idx; i < ns->num_zones; i++) {
4408 if (partial && nr_zones >= max_zones) {
4409 break;
4410 }
4411 if (nvme_zone_matches_filter(zrasf, zone++)) {
4412 nr_zones++;
4413 }
4414 }
4415 header = buf;
4416 header->nr_zones = cpu_to_le64(nr_zones);
4417
4418 buf_p = buf + sizeof(NvmeZoneReportHeader);
4419 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4420 zone = &ns->zone_array[zone_idx];
4421 if (nvme_zone_matches_filter(zrasf, zone)) {
4422 z = buf_p;
4423 buf_p += sizeof(NvmeZoneDescr);
4424
4425 z->zt = zone->d.zt;
4426 z->zs = zone->d.zs;
4427 z->zcap = cpu_to_le64(zone->d.zcap);
4428 z->zslba = cpu_to_le64(zone->d.zslba);
4429 z->za = zone->d.za;
4430
4431 if (nvme_wp_is_valid(zone)) {
4432 z->wp = cpu_to_le64(zone->d.wp);
4433 } else {
4434 z->wp = cpu_to_le64(~0ULL);
4435 }
4436
4437 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4438 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4439 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4440 ns->params.zd_extension_size);
4441 }
4442 buf_p += ns->params.zd_extension_size;
4443 }
4444
4445 max_zones--;
4446 }
4447 }
4448
4449 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4450
4451 g_free(buf);
4452
4453 return status;
4454 }
4455
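/*
 * I/O Management Receive, Reclaim Unit Handle Status: reports one descriptor
 * per (placement handle, reclaim group) pair of the namespace, truncated to
 * the host buffer length.
 */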
4456 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4457 size_t len)
4458 {
4459 NvmeNamespace *ns = req->ns;
4460 NvmeEnduranceGroup *endgrp;
4461 NvmeRuhStatus *hdr;
4462 NvmeRuhStatusDescr *ruhsd;
4463 unsigned int nruhsd;
4464 uint16_t rg, ph, *ruhid;
4465 size_t trans_len;
4466 g_autofree uint8_t *buf = NULL;
4467
4468 if (!n->subsys) {
4469 return NVME_INVALID_FIELD | NVME_DNR;
4470 }
4471
4472 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4473 return NVME_INVALID_NSID | NVME_DNR;
4474 }
4475
4476 if (!n->subsys->endgrp.fdp.enabled) {
4477 return NVME_FDP_DISABLED | NVME_DNR;
4478 }
4479
4480 endgrp = ns->endgrp;
4481
4482 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4483 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4484 buf = g_malloc0(trans_len);
4485
4486 trans_len = MIN(trans_len, len);
4487
4488 hdr = (NvmeRuhStatus *)buf;
4489 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4490
4491 hdr->nruhsd = cpu_to_le16(nruhsd);
4492
4493 ruhid = ns->fdp.phs;
4494
4495 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4496 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4497
4498 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4499 uint16_t pid = nvme_make_pid(ns, rg, ph);
4500
4501 ruhsd->pid = cpu_to_le16(pid);
4502 ruhsd->ruhid = *ruhid;
4503 ruhsd->earutr = 0;
4504 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4505 }
4506 }
4507
4508 return nvme_c2h(n, buf, trans_len, req);
4509 }
4510
4511 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4512 {
4513 NvmeCmd *cmd = &req->cmd;
4514 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4515 uint32_t numd = le32_to_cpu(cmd->cdw11);
4516 uint8_t mo = (cdw10 & 0xff);
4517 size_t len = (numd + 1) << 2;
4518
4519 switch (mo) {
4520 case NVME_IOMR_MO_NOP:
4521 return 0;
4522 case NVME_IOMR_MO_RUH_STATUS:
4523 return nvme_io_mgmt_recv_ruhs(n, req, len);
4524 default:
4525 return NVME_INVALID_FIELD | NVME_DNR;
4526 };
4527 }
4528
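/*
 * I/O Management Send, Reclaim Unit Handle Update: reads a list of 16-bit
 * placement identifiers from the host buffer and updates the corresponding
 * reclaim unit handles via nvme_update_ruh().
 */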
4529 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4530 {
4531 NvmeCmd *cmd = &req->cmd;
4532 NvmeNamespace *ns = req->ns;
4533 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4534 uint16_t ret = NVME_SUCCESS;
4535 uint32_t npid = (cdw10 >> 16) + 1;
4536 unsigned int i = 0;
4537 g_autofree uint16_t *pids = NULL;
4538 uint32_t maxnpid;
4539
4540 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4541 return NVME_FDP_DISABLED | NVME_DNR;
4542 }
4543
4544 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4545
4546 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4547 return NVME_INVALID_FIELD | NVME_DNR;
4548 }
4549
4550 pids = g_new(uint16_t, npid);
4551
4552 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4553 if (ret) {
4554 return ret;
4555 }
4556
4557 for (; i < npid; i++) {
4558 if (!nvme_update_ruh(n, ns, pids[i])) {
4559 return NVME_INVALID_FIELD | NVME_DNR;
4560 }
4561 }
4562
4563 return ret;
4564 }
4565
4566 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4567 {
4568 NvmeCmd *cmd = &req->cmd;
4569 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4570 uint8_t mo = (cdw10 & 0xff);
4571
4572 switch (mo) {
4573 case NVME_IOMS_MO_NOP:
4574 return 0;
4575 case NVME_IOMS_MO_RUH_UPDATE:
4576 return nvme_io_mgmt_send_ruh_update(n, req);
4577 default:
4578 return NVME_INVALID_FIELD | NVME_DNR;
4579 };
4580 }
4581
4582 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4583 {
4584 NvmeNamespace *ns;
4585 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4586
4587 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4588 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4589
4590 /*
4591 * In the base NVM command set, Flush may apply to all namespaces
4592 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4593 * along with TP 4056 (Namespace Types), it may be pretty screwed up.
4594 *
4595 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4596 * opcode with a specific command since we cannot determine a unique I/O
4597 * command set. Opcode 0h might well mean something entirely different
4598 * from Flush, with completely different semantics, in some other command
4599 * set - does an NSID of FFFFFFFFh then
4600 * mean "for all namespaces, apply whatever command set specific command
4601 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4602 * whatever command that uses the 0h opcode if, and only if, it allows NSID
4603 * to be FFFFFFFFh"?
4604 *
4605 * Anyway (and luckily), for now, we do not care about this since the
4606 * device only supports namespace types that include the NVM Flush command
4607 * (NVM and Zoned), so always do an NVM Flush.
4608 */
4609
4610 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4611 return nvme_flush(n, req);
4612 }
4613
4614 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4615 return NVME_INVALID_NSID | NVME_DNR;
4616 }
4617
4618 ns = nvme_ns(n, nsid);
4619 if (unlikely(!ns)) {
4620 return NVME_INVALID_FIELD | NVME_DNR;
4621 }
4622
4623 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4624 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4625 return NVME_INVALID_OPCODE | NVME_DNR;
4626 }
4627
4628 if (ns->status) {
4629 return ns->status;
4630 }
4631
4632 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4633 return NVME_INVALID_FIELD;
4634 }
4635
4636 req->ns = ns;
4637
4638 switch (req->cmd.opcode) {
4639 case NVME_CMD_WRITE_ZEROES:
4640 return nvme_write_zeroes(n, req);
4641 case NVME_CMD_ZONE_APPEND:
4642 return nvme_zone_append(n, req);
4643 case NVME_CMD_WRITE:
4644 return nvme_write(n, req);
4645 case NVME_CMD_READ:
4646 return nvme_read(n, req);
4647 case NVME_CMD_COMPARE:
4648 return nvme_compare(n, req);
4649 case NVME_CMD_DSM:
4650 return nvme_dsm(n, req);
4651 case NVME_CMD_VERIFY:
4652 return nvme_verify(n, req);
4653 case NVME_CMD_COPY:
4654 return nvme_copy(n, req);
4655 case NVME_CMD_ZONE_MGMT_SEND:
4656 return nvme_zone_mgmt_send(n, req);
4657 case NVME_CMD_ZONE_MGMT_RECV:
4658 return nvme_zone_mgmt_recv(n, req);
4659 case NVME_CMD_IO_MGMT_RECV:
4660 return nvme_io_mgmt_recv(n, req);
4661 case NVME_CMD_IO_MGMT_SEND:
4662 return nvme_io_mgmt_send(n, req);
4663 default:
4664 assert(false);
4665 }
4666
4667 return NVME_INVALID_OPCODE | NVME_DNR;
4668 }
4669
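/*
 * The event notifiers below back the optional ioeventfd acceleration: when
 * shadow doorbells (Doorbell Buffer Config) and the ioeventfd device
 * parameter are enabled, non-admin queue doorbell writes are serviced from an
 * event notifier instead of the regular MMIO write path.
 */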
4670 static void nvme_cq_notifier(EventNotifier *e)
4671 {
4672 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4673 NvmeCtrl *n = cq->ctrl;
4674
4675 if (!event_notifier_test_and_clear(e)) {
4676 return;
4677 }
4678
4679 nvme_update_cq_head(cq);
4680
4681 if (cq->tail == cq->head) {
4682 if (cq->irq_enabled) {
4683 n->cq_pending--;
4684 }
4685
4686 nvme_irq_deassert(n, cq);
4687 }
4688
4689 qemu_bh_schedule(cq->bh);
4690 }
4691
4692 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4693 {
4694 NvmeCtrl *n = cq->ctrl;
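/*
 * Doorbell registers live at BAR0 offset 1000h; with the minimum doorbell
 * stride (CAP.DSTRD = 0), which the fixed qid * 8 spacing here assumes, SQ
 * y's tail doorbell is at 1000h + y * 8 and CQ y's head doorbell is at
 * 1000h + y * 8 + 4.
 */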
4695 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4696 int ret;
4697
4698 ret = event_notifier_init(&cq->notifier, 0);
4699 if (ret < 0) {
4700 return ret;
4701 }
4702
4703 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4704 memory_region_add_eventfd(&n->iomem,
4705 0x1000 + offset, 4, false, 0, &cq->notifier);
4706
4707 return 0;
4708 }
4709
4710 static void nvme_sq_notifier(EventNotifier *e)
4711 {
4712 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4713
4714 if (!event_notifier_test_and_clear(e)) {
4715 return;
4716 }
4717
4718 nvme_process_sq(sq);
4719 }
4720
4721 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4722 {
4723 NvmeCtrl *n = sq->ctrl;
4724 uint16_t offset = sq->sqid << 3;
4725 int ret;
4726
4727 ret = event_notifier_init(&sq->notifier, 0);
4728 if (ret < 0) {
4729 return ret;
4730 }
4731
4732 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4733 memory_region_add_eventfd(&n->iomem,
4734 0x1000 + offset, 4, false, 0, &sq->notifier);
4735
4736 return 0;
4737 }
4738
4739 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4740 {
4741 uint16_t offset = sq->sqid << 3;
4742
4743 n->sq[sq->sqid] = NULL;
4744 qemu_bh_delete(sq->bh);
4745 if (sq->ioeventfd_enabled) {
4746 memory_region_del_eventfd(&n->iomem,
4747 0x1000 + offset, 4, false, 0, &sq->notifier);
4748 event_notifier_set_handler(&sq->notifier, NULL);
4749 event_notifier_cleanup(&sq->notifier);
4750 }
4751 g_free(sq->io_req);
4752 if (sq->sqid) {
4753 g_free(sq);
4754 }
4755 }
4756
4757 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4758 {
4759 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4760 NvmeRequest *r, *next;
4761 NvmeSQueue *sq;
4762 NvmeCQueue *cq;
4763 uint16_t qid = le16_to_cpu(c->qid);
4764
4765 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4766 trace_pci_nvme_err_invalid_del_sq(qid);
4767 return NVME_INVALID_QID | NVME_DNR;
4768 }
4769
4770 trace_pci_nvme_del_sq(qid);
4771
4772 sq = n->sq[qid];
4773 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4774 r = QTAILQ_FIRST(&sq->out_req_list);
4775 assert(r->aiocb);
4776 blk_aio_cancel(r->aiocb);
4777 }
4778
4779 assert(QTAILQ_EMPTY(&sq->out_req_list));
4780
4781 if (!nvme_check_cqid(n, sq->cqid)) {
4782 cq = n->cq[sq->cqid];
4783 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4784
4785 nvme_post_cqes(cq);
4786 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4787 if (r->sq == sq) {
4788 QTAILQ_REMOVE(&cq->req_list, r, entry);
4789 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4790 }
4791 }
4792 }
4793
4794 nvme_free_sq(sq, n);
4795 return NVME_SUCCESS;
4796 }
4797
4798 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4799 uint16_t sqid, uint16_t cqid, uint16_t size)
4800 {
4801 int i;
4802 NvmeCQueue *cq;
4803
4804 sq->ctrl = n;
4805 sq->dma_addr = dma_addr;
4806 sq->sqid = sqid;
4807 sq->size = size;
4808 sq->cqid = cqid;
4809 sq->head = sq->tail = 0;
4810 sq->io_req = g_new0(NvmeRequest, sq->size);
4811
4812 QTAILQ_INIT(&sq->req_list);
4813 QTAILQ_INIT(&sq->out_req_list);
4814 for (i = 0; i < sq->size; i++) {
4815 sq->io_req[i].sq = sq;
4816 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4817 }
4818
4819 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4820 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4821
4822 if (n->dbbuf_enabled) {
4823 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4824 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4825
4826 if (n->params.ioeventfd && sq->sqid != 0) {
4827 if (!nvme_init_sq_ioeventfd(sq)) {
4828 sq->ioeventfd_enabled = true;
4829 }
4830 }
4831 }
4832
4833 assert(n->cq[cqid]);
4834 cq = n->cq[cqid];
4835 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4836 n->sq[sqid] = sq;
4837 }
4838
4839 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4840 {
4841 NvmeSQueue *sq;
4842 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4843
4844 uint16_t cqid = le16_to_cpu(c->cqid);
4845 uint16_t sqid = le16_to_cpu(c->sqid);
4846 uint16_t qsize = le16_to_cpu(c->qsize);
4847 uint16_t qflags = le16_to_cpu(c->sq_flags);
4848 uint64_t prp1 = le64_to_cpu(c->prp1);
4849
4850 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4851
4852 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4853 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4854 return NVME_INVALID_CQID | NVME_DNR;
4855 }
4856 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4857 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4858 return NVME_INVALID_QID | NVME_DNR;
4859 }
4860 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4861 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4862 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4863 }
4864 if (unlikely(prp1 & (n->page_size - 1))) {
4865 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4866 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4867 }
4868 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4869 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4870 return NVME_INVALID_FIELD | NVME_DNR;
4871 }
4872 sq = g_malloc0(sizeof(*sq));
4873 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4874 return NVME_SUCCESS;
4875 }
4876
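/*
 * Accumulators for per-namespace block accounting, used by the SMART and
 * Endurance Group Information log pages below.
 */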
4877 struct nvme_stats {
4878 uint64_t units_read;
4879 uint64_t units_written;
4880 uint64_t read_commands;
4881 uint64_t write_commands;
4882 };
4883
4884 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4885 {
4886 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4887
4888 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4889 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4890 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4891 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4892 }
4893
4894 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4895 uint64_t off, NvmeRequest *req)
4896 {
4897 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4898 struct nvme_stats stats = { 0 };
4899 NvmeSmartLog smart = { 0 };
4900 uint32_t trans_len;
4901 NvmeNamespace *ns;
4902 time_t current_ms;
4903 uint64_t u_read, u_written;
4904
4905 if (off >= sizeof(smart)) {
4906 return NVME_INVALID_FIELD | NVME_DNR;
4907 }
4908
4909 if (nsid != 0xffffffff) {
4910 ns = nvme_ns(n, nsid);
4911 if (!ns) {
4912 return NVME_INVALID_NSID | NVME_DNR;
4913 }
4914 nvme_set_blk_stats(ns, &stats);
4915 } else {
4916 int i;
4917
4918 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4919 ns = nvme_ns(n, i);
4920 if (!ns) {
4921 continue;
4922 }
4923 nvme_set_blk_stats(ns, &stats);
4924 }
4925 }
4926
4927 trans_len = MIN(sizeof(smart) - off, buf_len);
4928 smart.critical_warning = n->smart_critical_warning;
4929
4930 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
4931 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
4932
4933 smart.data_units_read[0] = cpu_to_le64(u_read);
4934 smart.data_units_written[0] = cpu_to_le64(u_written);
4935 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4936 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4937
4938 smart.temperature = cpu_to_le16(n->temperature);
4939
4940 if ((n->temperature >= n->features.temp_thresh_hi) ||
4941 (n->temperature <= n->features.temp_thresh_low)) {
4942 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4943 }
4944
4945 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4946 smart.power_on_hours[0] =
4947 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4948
4949 if (!rae) {
4950 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4951 }
4952
4953 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4954 }
4955
4956 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4957 uint64_t off, NvmeRequest *req)
4958 {
4959 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
4960 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
4961 struct nvme_stats stats = {};
4962 NvmeEndGrpLog info = {};
4963 int i;
4964
4965 if (!n->subsys || endgrpid != 0x1) {
4966 return NVME_INVALID_FIELD | NVME_DNR;
4967 }
4968
4969 if (off >= sizeof(info)) {
4970 return NVME_INVALID_FIELD | NVME_DNR;
4971 }
4972
4973 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4974 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
4975 if (!ns) {
4976 continue;
4977 }
4978
4979 nvme_set_blk_stats(ns, &stats);
4980 }
4981
4982 info.data_units_read[0] =
4983 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
4984 info.data_units_written[0] =
4985 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4986 info.media_units_written[0] =
4987 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4988
4989 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4990 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4991
4992 buf_len = MIN(sizeof(info) - off, buf_len);
4993
4994 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
4995 }
4996
4997
4998 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4999 NvmeRequest *req)
5000 {
5001 uint32_t trans_len;
5002 NvmeFwSlotInfoLog fw_log = {
5003 .afi = 0x1,
5004 };
5005
5006 if (off >= sizeof(fw_log)) {
5007 return NVME_INVALID_FIELD | NVME_DNR;
5008 }
5009
5010 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5011 trans_len = MIN(sizeof(fw_log) - off, buf_len);
5012
5013 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5014 }
5015
5016 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5017 uint64_t off, NvmeRequest *req)
5018 {
5019 uint32_t trans_len;
5020 NvmeErrorLog errlog;
5021
5022 if (off >= sizeof(errlog)) {
5023 return NVME_INVALID_FIELD | NVME_DNR;
5024 }
5025
5026 if (!rae) {
5027 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5028 }
5029
5030 memset(&errlog, 0x0, sizeof(errlog));
5031 trans_len = MIN(sizeof(errlog) - off, buf_len);
5032
5033 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5034 }
5035
5036 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5037 uint64_t off, NvmeRequest *req)
5038 {
5039 uint32_t nslist[1024];
5040 uint32_t trans_len;
5041 int i = 0;
5042 uint32_t nsid;
5043
5044 if (off >= sizeof(nslist)) {
5045 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5046 return NVME_INVALID_FIELD | NVME_DNR;
5047 }
5048
5049 memset(nslist, 0x0, sizeof(nslist));
5050 trans_len = MIN(sizeof(nslist) - off, buf_len);
5051
5052 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5053 NVME_CHANGED_NSID_SIZE) {
5054 /*
5055 * If there are more than 1024 namespaces, the first entry in the log
5056 * page should be set to FFFFFFFFh and the others to 0, as per the spec.
5057 */
5058 if (i == ARRAY_SIZE(nslist)) {
5059 memset(nslist, 0x0, sizeof(nslist));
5060 nslist[0] = 0xffffffff;
5061 break;
5062 }
5063
5064 nslist[i++] = nsid;
5065 clear_bit(nsid, n->changed_nsids);
5066 }
5067
5068 /*
5069 * Remove all remaining list entries in case we return early because
5070 * there are more than 1024 namespaces.
5071 */
5072 if (nslist[0] == 0xffffffff) {
5073 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5074 }
5075
5076 if (!rae) {
5077 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5078 }
5079
5080 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5081 }
5082
5083 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5084 uint64_t off, NvmeRequest *req)
5085 {
5086 NvmeEffectsLog log = {};
5087 const uint32_t *src_iocs = NULL;
5088 uint32_t trans_len;
5089
5090 if (off >= sizeof(log)) {
5091 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5092 return NVME_INVALID_FIELD | NVME_DNR;
5093 }
5094
5095 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5096 case NVME_CC_CSS_NVM:
5097 src_iocs = nvme_cse_iocs_nvm;
5098 /* fall through */
5099 case NVME_CC_CSS_ADMIN_ONLY:
5100 break;
5101 case NVME_CC_CSS_CSI:
5102 switch (csi) {
5103 case NVME_CSI_NVM:
5104 src_iocs = nvme_cse_iocs_nvm;
5105 break;
5106 case NVME_CSI_ZONED:
5107 src_iocs = nvme_cse_iocs_zoned;
5108 break;
5109 }
5110 }
5111
5112 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
5113
5114 if (src_iocs) {
5115 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
5116 }
5117
5118 trans_len = MIN(sizeof(log) - off, buf_len);
5119
5120 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5121 }
5122
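/*
 * Size of a single FDP configuration descriptor: the fixed header, one
 * reclaim unit handle descriptor per RUH, plus the vendor specific size,
 * rounded up to an 8-byte boundary.
 */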
5123 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5124 {
5125 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5126 + vss;
5127 return ROUND_UP(entry_siz, 8);
5128 }
5129
5130 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5131 uint64_t off, NvmeRequest *req)
5132 {
5133 uint32_t log_size, trans_len;
5134 g_autofree uint8_t *buf = NULL;
5135 NvmeFdpDescrHdr *hdr;
5136 NvmeRuhDescr *ruhd;
5137 NvmeEnduranceGroup *endgrp;
5138 NvmeFdpConfsHdr *log;
5139 size_t nruh, fdp_descr_size;
5140 int i;
5141
5142 if (endgrpid != 1 || !n->subsys) {
5143 return NVME_INVALID_FIELD | NVME_DNR;
5144 }
5145
5146 endgrp = &n->subsys->endgrp;
5147
5148 if (endgrp->fdp.enabled) {
5149 nruh = endgrp->fdp.nruh;
5150 } else {
5151 nruh = 1;
5152 }
5153
5154 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5155 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5156
5157 if (off >= log_size) {
5158 return NVME_INVALID_FIELD | NVME_DNR;
5159 }
5160
5161 trans_len = MIN(log_size - off, buf_len);
5162
5163 buf = g_malloc0(log_size);
5164 log = (NvmeFdpConfsHdr *)buf;
5165 hdr = (NvmeFdpDescrHdr *)(log + 1);
5166 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5167
5168 log->num_confs = cpu_to_le16(0);
5169 log->size = cpu_to_le32(log_size);
5170
5171 hdr->descr_size = cpu_to_le16(fdp_descr_size);
5172 if (endgrp->fdp.enabled) {
5173 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5174 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5175 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5176 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5177 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5178 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5179 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5180
5181 for (i = 0; i < nruh; i++) {
5182 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5183 ruhd++;
5184 }
5185 } else {
5186 /* 1 bit for RUH in PIF -> 2 RUHs max. */
5187 hdr->nrg = cpu_to_le16(1);
5188 hdr->nruh = cpu_to_le16(1);
5189 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5190 hdr->nnss = cpu_to_le32(1);
5191 hdr->runs = cpu_to_le64(96 * MiB);
5192
5193 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5194 }
5195
5196 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5197 }
5198
5199 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5200 uint32_t dw10, uint32_t dw12,
5201 uint32_t buf_len, uint64_t off,
5202 NvmeRequest *req)
5203 {
5204 NvmeRuHandle *ruh;
5205 NvmeRuhuLog *hdr;
5206 NvmeRuhuDescr *ruhud;
5207 NvmeEnduranceGroup *endgrp;
5208 g_autofree uint8_t *buf = NULL;
5209 uint32_t log_size, trans_len;
5210 uint16_t i;
5211
5212 if (endgrpid != 1 || !n->subsys) {
5213 return NVME_INVALID_FIELD | NVME_DNR;
5214 }
5215
5216 endgrp = &n->subsys->endgrp;
5217
5218 if (!endgrp->fdp.enabled) {
5219 return NVME_FDP_DISABLED | NVME_DNR;
5220 }
5221
5222 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5223
5224 if (off >= log_size) {
5225 return NVME_INVALID_FIELD | NVME_DNR;
5226 }
5227
5228 trans_len = MIN(log_size - off, buf_len);
5229
5230 buf = g_malloc0(log_size);
5231 hdr = (NvmeRuhuLog *)buf;
5232 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5233
5234 ruh = endgrp->fdp.ruhs;
5235 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5236
5237 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5238 ruhud->ruha = ruh->ruha;
5239 }
5240
5241 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5242 }
5243
5244 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5245 uint64_t off, NvmeRequest *req)
5246 {
5247 NvmeEnduranceGroup *endgrp;
5248 NvmeFdpStatsLog log = {};
5249 uint32_t trans_len;
5250
5251 if (off >= sizeof(NvmeFdpStatsLog)) {
5252 return NVME_INVALID_FIELD | NVME_DNR;
5253 }
5254
5255 if (endgrpid != 1 || !n->subsys) {
5256 return NVME_INVALID_FIELD | NVME_DNR;
5257 }
5258
5259 if (!n->subsys->endgrp.fdp.enabled) {
5260 return NVME_FDP_DISABLED | NVME_DNR;
5261 }
5262
5263 endgrp = &n->subsys->endgrp;
5264
5265 trans_len = MIN(sizeof(log) - off, buf_len);
5266
5267 /* spec value is 128 bit, we only use 64 bit */
5268 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5269 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5270 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5271
5272 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5273 }
5274
5275 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5276 uint32_t buf_len, uint64_t off,
5277 NvmeRequest *req)
5278 {
5279 NvmeEnduranceGroup *endgrp;
5280 NvmeCmd *cmd = &req->cmd;
5281 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5282 uint32_t log_size, trans_len;
5283 NvmeFdpEventBuffer *ebuf;
5284 g_autofree NvmeFdpEventsLog *elog = NULL;
5285 NvmeFdpEvent *event;
5286
5287 if (endgrpid != 1 || !n->subsys) {
5288 return NVME_INVALID_FIELD | NVME_DNR;
5289 }
5290
5291 endgrp = &n->subsys->endgrp;
5292
5293 if (!endgrp->fdp.enabled) {
5294 return NVME_FDP_DISABLED | NVME_DNR;
5295 }
5296
5297 if (host_events) {
5298 ebuf = &endgrp->fdp.host_events;
5299 } else {
5300 ebuf = &endgrp->fdp.ctrl_events;
5301 }
5302
5303 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5304
5305 if (off >= log_size) {
5306 return NVME_INVALID_FIELD | NVME_DNR;
5307 }
5308
5309 trans_len = MIN(log_size - off, buf_len);
5310 elog = g_malloc0(log_size);
5311 elog->num_events = cpu_to_le32(ebuf->nelems);
5312 event = (NvmeFdpEvent *)(elog + 1);
5313
5314 if (ebuf->nelems && ebuf->start == ebuf->next) {
5315 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5316 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
5317 memcpy(event, &ebuf->events[ebuf->start],
5318 sizeof(NvmeFdpEvent) * nelems);
5319 memcpy(event + nelems, ebuf->events,
5320 sizeof(NvmeFdpEvent) * ebuf->next);
5321 } else if (ebuf->start < ebuf->next) {
5322 memcpy(event, &ebuf->events[ebuf->start],
5323 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5324 }
5325
5326 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5327 }
5328
5329 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5330 {
5331 NvmeCmd *cmd = &req->cmd;
5332
5333 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5334 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5335 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5336 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5337 uint8_t lid = dw10 & 0xff;
5338 uint8_t lsp = (dw10 >> 8) & 0xf;
5339 uint8_t rae = (dw10 >> 15) & 0x1;
5340 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5341 uint32_t numdl, numdu, lspi;
5342 uint64_t off, lpol, lpou;
5343 size_t len;
5344 uint16_t status;
5345
5346 numdl = (dw10 >> 16);
5347 numdu = (dw11 & 0xffff);
5348 lspi = (dw11 >> 16);
5349 lpol = dw12;
5350 lpou = dw13;
5351
5352 len = (((numdu << 16) | numdl) + 1) << 2;
5353 off = (lpou << 32ULL) | lpol;
5354
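/*
 * Example: NUMDL = 3FFh with NUMDU = 0 yields 400h dwords, i.e. a 4 KiB
 * transfer; the offset is the 64-bit concatenation of LPOU:LPOL and must be
 * dword aligned (checked just below).
 */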
5355 if (off & 0x3) {
5356 return NVME_INVALID_FIELD | NVME_DNR;
5357 }
5358
5359 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5360
5361 status = nvme_check_mdts(n, len);
5362 if (status) {
5363 return status;
5364 }
5365
5366 switch (lid) {
5367 case NVME_LOG_ERROR_INFO:
5368 return nvme_error_info(n, rae, len, off, req);
5369 case NVME_LOG_SMART_INFO:
5370 return nvme_smart_info(n, rae, len, off, req);
5371 case NVME_LOG_FW_SLOT_INFO:
5372 return nvme_fw_log_info(n, len, off, req);
5373 case NVME_LOG_CHANGED_NSLIST:
5374 return nvme_changed_nslist(n, rae, len, off, req);
5375 case NVME_LOG_CMD_EFFECTS:
5376 return nvme_cmd_effects(n, csi, len, off, req);
5377 case NVME_LOG_ENDGRP:
5378 return nvme_endgrp_info(n, rae, len, off, req);
5379 case NVME_LOG_FDP_CONFS:
5380 return nvme_fdp_confs(n, lspi, len, off, req);
5381 case NVME_LOG_FDP_RUH_USAGE:
5382 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5383 case NVME_LOG_FDP_STATS:
5384 return nvme_fdp_stats(n, lspi, len, off, req);
5385 case NVME_LOG_FDP_EVENTS:
5386 return nvme_fdp_events(n, lspi, len, off, req);
5387 default:
5388 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5389 return NVME_INVALID_FIELD | NVME_DNR;
5390 }
5391 }
5392
5393 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5394 {
5395 PCIDevice *pci = PCI_DEVICE(n);
5396 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5397
5398 n->cq[cq->cqid] = NULL;
5399 qemu_bh_delete(cq->bh);
5400 if (cq->ioeventfd_enabled) {
5401 memory_region_del_eventfd(&n->iomem,
5402 0x1000 + offset, 4, false, 0, &cq->notifier);
5403 event_notifier_set_handler(&cq->notifier, NULL);
5404 event_notifier_cleanup(&cq->notifier);
5405 }
5406 if (msix_enabled(pci)) {
5407 msix_vector_unuse(pci, cq->vector);
5408 }
5409 if (cq->cqid) {
5410 g_free(cq);
5411 }
5412 }
5413
5414 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5415 {
5416 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5417 NvmeCQueue *cq;
5418 uint16_t qid = le16_to_cpu(c->qid);
5419
5420 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5421 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5422 return NVME_INVALID_CQID | NVME_DNR;
5423 }
5424
5425 cq = n->cq[qid];
5426 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5427 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5428 return NVME_INVALID_QUEUE_DEL;
5429 }
5430
5431 if (cq->irq_enabled && cq->tail != cq->head) {
5432 n->cq_pending--;
5433 }
5434
5435 nvme_irq_deassert(n, cq);
5436 trace_pci_nvme_del_cq(qid);
5437 nvme_free_cq(cq, n);
5438 return NVME_SUCCESS;
5439 }
5440
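/*
 * Initialise a completion queue: the phase tag starts at 1, the MSI-X vector
 * is marked in use when MSI-X is enabled, and shadow doorbell/ioeventfd state
 * is wired up for non-admin queues when configured.
 */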
5441 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5442 uint16_t cqid, uint16_t vector, uint16_t size,
5443 uint16_t irq_enabled)
5444 {
5445 PCIDevice *pci = PCI_DEVICE(n);
5446
5447 if (msix_enabled(pci)) {
5448 msix_vector_use(pci, vector);
5449 }
5450 cq->ctrl = n;
5451 cq->cqid = cqid;
5452 cq->size = size;
5453 cq->dma_addr = dma_addr;
5454 cq->phase = 1;
5455 cq->irq_enabled = irq_enabled;
5456 cq->vector = vector;
5457 cq->head = cq->tail = 0;
5458 QTAILQ_INIT(&cq->req_list);
5459 QTAILQ_INIT(&cq->sq_list);
5460 if (n->dbbuf_enabled) {
5461 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5462 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5463
5464 if (n->params.ioeventfd && cqid != 0) {
5465 if (!nvme_init_cq_ioeventfd(cq)) {
5466 cq->ioeventfd_enabled = true;
5467 }
5468 }
5469 }
5470 n->cq[cqid] = cq;
5471 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5472 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5473 }
5474
5475 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5476 {
5477 NvmeCQueue *cq;
5478 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5479 uint16_t cqid = le16_to_cpu(c->cqid);
5480 uint16_t vector = le16_to_cpu(c->irq_vector);
5481 uint16_t qsize = le16_to_cpu(c->qsize);
5482 uint16_t qflags = le16_to_cpu(c->cq_flags);
5483 uint64_t prp1 = le64_to_cpu(c->prp1);
5484 uint32_t cc = ldq_le_p(&n->bar.cc);
5485 uint8_t iocqes = NVME_CC_IOCQES(cc);
5486 uint8_t iosqes = NVME_CC_IOSQES(cc);
5487
5488 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5489 NVME_CQ_FLAGS_IEN(qflags) != 0);
5490
5491 if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5492 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5493 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5494 }
5495
5496 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5497 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5498 return NVME_INVALID_QID | NVME_DNR;
5499 }
5500 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5501 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5502 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5503 }
5504 if (unlikely(prp1 & (n->page_size - 1))) {
5505 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5506 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5507 }
5508 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5509 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5510 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5511 }
5512 if (unlikely(vector >= n->conf_msix_qsize)) {
5513 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5514 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5515 }
5516 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5517 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5518 return NVME_INVALID_FIELD | NVME_DNR;
5519 }
5520
5521 cq = g_malloc0(sizeof(*cq));
5522 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5523 NVME_CQ_FLAGS_IEN(qflags));
5524
5525 /*
5526 * It is only required to set qs_created when creating a completion queue;
5527 * creating a submission queue without a matching completion queue will
5528 * fail.
5529 */
5530 n->qs_created = true;
5531 return NVME_SUCCESS;
5532 }
5533
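/*
 * Identify handlers. Namespaces that are not active on this controller are
 * generally reported with an all-zeroes data structure (via
 * nvme_rpt_empty_id_struct) rather than an error.
 */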
5534 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5535 {
5536 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5537
5538 return nvme_c2h(n, id, sizeof(id), req);
5539 }
5540
5541 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5542 {
5543 trace_pci_nvme_identify_ctrl();
5544
5545 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5546 }
5547
5548 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5549 {
5550 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5551 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5552 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5553
5554 trace_pci_nvme_identify_ctrl_csi(c->csi);
5555
5556 switch (c->csi) {
5557 case NVME_CSI_NVM:
5558 id_nvm->vsl = n->params.vsl;
5559 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5560 break;
5561
5562 case NVME_CSI_ZONED:
5563 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5564 break;
5565
5566 default:
5567 return NVME_INVALID_FIELD | NVME_DNR;
5568 }
5569
5570 return nvme_c2h(n, id, sizeof(id), req);
5571 }
5572
5573 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5574 {
5575 NvmeNamespace *ns;
5576 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5577 uint32_t nsid = le32_to_cpu(c->nsid);
5578
5579 trace_pci_nvme_identify_ns(nsid);
5580
5581 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5582 return NVME_INVALID_NSID | NVME_DNR;
5583 }
5584
5585 ns = nvme_ns(n, nsid);
5586 if (unlikely(!ns)) {
5587 if (!active) {
5588 ns = nvme_subsys_ns(n->subsys, nsid);
5589 if (!ns) {
5590 return nvme_rpt_empty_id_struct(n, req);
5591 }
5592 } else {
5593 return nvme_rpt_empty_id_struct(n, req);
5594 }
5595 }
5596
5597 if (active || ns->csi == NVME_CSI_NVM) {
5598 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5599 }
5600
5601 return NVME_INVALID_CMD_SET | NVME_DNR;
5602 }
5603
5604 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5605 bool attached)
5606 {
5607 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5608 uint32_t nsid = le32_to_cpu(c->nsid);
5609 uint16_t min_id = le16_to_cpu(c->ctrlid);
5610 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5611 uint16_t *ids = &list[1];
5612 NvmeNamespace *ns;
5613 NvmeCtrl *ctrl;
5614 int cntlid, nr_ids = 0;
5615
5616 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5617
5618 if (!n->subsys) {
5619 return NVME_INVALID_FIELD | NVME_DNR;
5620 }
5621
5622 if (attached) {
5623 if (nsid == NVME_NSID_BROADCAST) {
5624 return NVME_INVALID_FIELD | NVME_DNR;
5625 }
5626
5627 ns = nvme_subsys_ns(n->subsys, nsid);
5628 if (!ns) {
5629 return NVME_INVALID_FIELD | NVME_DNR;
5630 }
5631 }
5632
5633 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5634 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5635 if (!ctrl) {
5636 continue;
5637 }
5638
5639 if (attached && !nvme_ns(ctrl, nsid)) {
5640 continue;
5641 }
5642
5643 ids[nr_ids++] = cntlid;
5644 }
5645
5646 list[0] = nr_ids;
5647
5648 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5649 }
5650
5651 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5652 {
5653 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5654
5655 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5656 sizeof(NvmePriCtrlCap), req);
5657 }
5658
5659 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5660 {
5661 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5662 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5663 uint16_t min_id = le16_to_cpu(c->ctrlid);
5664 uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5665 NvmeSecCtrlList list = {0};
5666 uint8_t i;
5667
5668 for (i = 0; i < num_sec_ctrl; i++) {
5669 if (n->sec_ctrl_list[i].scid >= min_id) {
5670 list.numcntl = MIN(num_sec_ctrl - i, 127);
5671 memcpy(&list.sec, n->sec_ctrl_list + i,
5672 list.numcntl * sizeof(NvmeSecCtrlEntry));
5673 break;
5674 }
5675 }
5676
5677 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5678
5679 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5680 }
5681
5682 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5683 bool active)
5684 {
5685 NvmeNamespace *ns;
5686 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5687 uint32_t nsid = le32_to_cpu(c->nsid);
5688
5689 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5690
5691 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5692 return NVME_INVALID_NSID | NVME_DNR;
5693 }
5694
5695 ns = nvme_ns(n, nsid);
5696 if (unlikely(!ns)) {
5697 if (!active) {
5698 ns = nvme_subsys_ns(n->subsys, nsid);
5699 if (!ns) {
5700 return nvme_rpt_empty_id_struct(n, req);
5701 }
5702 } else {
5703 return nvme_rpt_empty_id_struct(n, req);
5704 }
5705 }
5706
5707 if (c->csi == NVME_CSI_NVM) {
5708 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5709 req);
5710 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5711 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5712 req);
5713 }
5714
5715 return NVME_INVALID_FIELD | NVME_DNR;
5716 }
5717
5718 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5719 bool active)
5720 {
5721 NvmeNamespace *ns;
5722 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5723 uint32_t min_nsid = le32_to_cpu(c->nsid);
5724 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5725 static const int data_len = sizeof(list);
5726 uint32_t *list_ptr = (uint32_t *)list;
5727 int i, j = 0;
5728
5729 trace_pci_nvme_identify_nslist(min_nsid);
5730
5731 /*
5732 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5733 * since the Active Namespace ID List should return namespaces with ids
5734 * *higher* than the NSID specified in the command. This is also specified
5735 * in the spec (NVM Express v1.3d, Section 5.15.4).
5736 */
5737 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5738 return NVME_INVALID_NSID | NVME_DNR;
5739 }
5740
5741 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5742 ns = nvme_ns(n, i);
5743 if (!ns) {
5744 if (!active) {
5745 ns = nvme_subsys_ns(n->subsys, i);
5746 if (!ns) {
5747 continue;
5748 }
5749 } else {
5750 continue;
5751 }
5752 }
5753 if (ns->params.nsid <= min_nsid) {
5754 continue;
5755 }
5756 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5757 if (j == data_len / sizeof(uint32_t)) {
5758 break;
5759 }
5760 }
5761
5762 return nvme_c2h(n, list, data_len, req);
5763 }
5764
5765 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5766 bool active)
5767 {
5768 NvmeNamespace *ns;
5769 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5770 uint32_t min_nsid = le32_to_cpu(c->nsid);
5771 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5772 static const int data_len = sizeof(list);
5773 uint32_t *list_ptr = (uint32_t *)list;
5774 int i, j = 0;
5775
5776 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5777
5778 /*
5779 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5780 */
5781 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5782 return NVME_INVALID_NSID | NVME_DNR;
5783 }
5784
5785 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5786 return NVME_INVALID_FIELD | NVME_DNR;
5787 }
5788
5789 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5790 ns = nvme_ns(n, i);
5791 if (!ns) {
5792 if (!active) {
5793 ns = nvme_subsys_ns(n->subsys, i);
5794 if (!ns) {
5795 continue;
5796 }
5797 } else {
5798 continue;
5799 }
5800 }
5801 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5802 continue;
5803 }
5804 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5805 if (j == data_len / sizeof(uint32_t)) {
5806 break;
5807 }
5808 }
5809
5810 return nvme_c2h(n, list, data_len, req);
5811 }
5812
5813 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5814 {
5815 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5816 uint16_t *nr_ids = &list[0];
5817 uint16_t *ids = &list[1];
5818 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5819
5820 /*
5821 * The current nvme-subsys only supports Endurance Group #1.
5822 */
5823 if (!endgid) {
5824 *nr_ids = 1;
5825 ids[0] = 1;
5826 } else {
5827 *nr_ids = 0;
5828 }
5829
5830 return nvme_c2h(n, list, sizeof(list), req);
5831 }
5832
5833 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5834 {
5835 NvmeNamespace *ns;
5836 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5837 uint32_t nsid = le32_to_cpu(c->nsid);
5838 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5839 uint8_t *pos = list;
5840 struct {
5841 NvmeIdNsDescr hdr;
5842 uint8_t v[NVME_NIDL_UUID];
5843 } QEMU_PACKED uuid = {};
5844 struct {
5845 NvmeIdNsDescr hdr;
5846 uint8_t v[NVME_NIDL_NGUID];
5847 } QEMU_PACKED nguid = {};
5848 struct {
5849 NvmeIdNsDescr hdr;
5850 uint64_t v;
5851 } QEMU_PACKED eui64 = {};
5852 struct {
5853 NvmeIdNsDescr hdr;
5854 uint8_t v;
5855 } QEMU_PACKED csi = {};
5856
5857 trace_pci_nvme_identify_ns_descr_list(nsid);
5858
5859 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5860 return NVME_INVALID_NSID | NVME_DNR;
5861 }
5862
5863 ns = nvme_ns(n, nsid);
5864 if (unlikely(!ns)) {
5865 return NVME_INVALID_FIELD | NVME_DNR;
5866 }
5867
5868 if (!qemu_uuid_is_null(&ns->params.uuid)) {
5869 uuid.hdr.nidt = NVME_NIDT_UUID;
5870 uuid.hdr.nidl = NVME_NIDL_UUID;
5871 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5872 memcpy(pos, &uuid, sizeof(uuid));
5873 pos += sizeof(uuid);
5874 }
5875
5876 if (!nvme_nguid_is_null(&ns->params.nguid)) {
5877 nguid.hdr.nidt = NVME_NIDT_NGUID;
5878 nguid.hdr.nidl = NVME_NIDL_NGUID;
5879 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
5880 memcpy(pos, &nguid, sizeof(nguid));
5881 pos += sizeof(nguid);
5882 }
5883
5884 if (ns->params.eui64) {
5885 eui64.hdr.nidt = NVME_NIDT_EUI64;
5886 eui64.hdr.nidl = NVME_NIDL_EUI64;
5887 eui64.v = cpu_to_be64(ns->params.eui64);
5888 memcpy(pos, &eui64, sizeof(eui64));
5889 pos += sizeof(eui64);
5890 }
5891
5892 csi.hdr.nidt = NVME_NIDT_CSI;
5893 csi.hdr.nidl = NVME_NIDL_CSI;
5894 csi.v = ns->csi;
5895 memcpy(pos, &csi, sizeof(csi));
5896 pos += sizeof(csi);
5897
5898 return nvme_c2h(n, list, sizeof(list), req);
5899 }
5900
5901 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5902 {
5903 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5904 static const int data_len = sizeof(list);
5905
5906 trace_pci_nvme_identify_cmd_set();
5907
5908 NVME_SET_CSI(*list, NVME_CSI_NVM);
5909 NVME_SET_CSI(*list, NVME_CSI_ZONED);
5910
5911 return nvme_c2h(n, list, data_len, req);
5912 }
5913
5914 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5915 {
5916 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5917
5918 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5919 c->csi);
5920
5921 switch (c->cns) {
5922 case NVME_ID_CNS_NS:
5923 return nvme_identify_ns(n, req, true);
5924 case NVME_ID_CNS_NS_PRESENT:
5925 return nvme_identify_ns(n, req, false);
5926 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5927 return nvme_identify_ctrl_list(n, req, true);
5928 case NVME_ID_CNS_CTRL_LIST:
5929 return nvme_identify_ctrl_list(n, req, false);
5930 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5931 return nvme_identify_pri_ctrl_cap(n, req);
5932 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5933 return nvme_identify_sec_ctrl_list(n, req);
5934 case NVME_ID_CNS_CS_NS:
5935 return nvme_identify_ns_csi(n, req, true);
5936 case NVME_ID_CNS_CS_NS_PRESENT:
5937 return nvme_identify_ns_csi(n, req, false);
5938 case NVME_ID_CNS_CTRL:
5939 return nvme_identify_ctrl(n, req);
5940 case NVME_ID_CNS_CS_CTRL:
5941 return nvme_identify_ctrl_csi(n, req);
5942 case NVME_ID_CNS_NS_ACTIVE_LIST:
5943 return nvme_identify_nslist(n, req, true);
5944 case NVME_ID_CNS_NS_PRESENT_LIST:
5945 return nvme_identify_nslist(n, req, false);
5946 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
5947 return nvme_identify_nslist_csi(n, req, true);
5948 case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
5949 return nvme_endurance_group_list(n, req);
5950 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
5951 return nvme_identify_nslist_csi(n, req, false);
5952 case NVME_ID_CNS_NS_DESCR_LIST:
5953 return nvme_identify_ns_descr_list(n, req);
5954 case NVME_ID_CNS_IO_COMMAND_SET:
5955 return nvme_identify_cmd_set(n, req);
5956 default:
5957 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
5958 return NVME_INVALID_FIELD | NVME_DNR;
5959 }
5960 }
5961
5962 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
5963 {
5964 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
5965 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
5966 NvmeSQueue *sq = n->sq[sqid];
5967 NvmeRequest *r, *next;
5968 int i;
5969
5970 req->cqe.result = 1;
5971 if (nvme_check_sqid(n, sqid)) {
5972 return NVME_INVALID_FIELD | NVME_DNR;
5973 }
5974
5975 if (sqid == 0) {
5976 for (i = 0; i < n->outstanding_aers; i++) {
5977 NvmeRequest *re = n->aer_reqs[i];
5978 if (re->cqe.cid == cid) {
5979 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
5980 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
5981 n->outstanding_aers--;
5982 re->status = NVME_CMD_ABORT_REQ;
5983 req->cqe.result = 0;
5984 nvme_enqueue_req_completion(&n->admin_cq, re);
5985 return NVME_SUCCESS;
5986 }
5987 }
5988 }
5989
5990 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
5991 if (r->cqe.cid == cid) {
5992 if (r->aiocb) {
5993 blk_aio_cancel_async(r->aiocb);
5994 }
5995 break;
5996 }
5997 }
5998
5999 return NVME_SUCCESS;
6000 }
6001
6002 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6003 {
6004 trace_pci_nvme_setfeat_timestamp(ts);
6005
6006 n->host_timestamp = le64_to_cpu(ts);
6007 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6008 }
6009
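/*
 * Synthesize the current timestamp: the value the host last programmed plus
 * the milliseconds elapsed on QEMU_CLOCK_VIRTUAL since then. The origin field
 * indicates whether the host ever set a (non-zero) timestamp.
 */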
6010 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6011 {
6012 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6013 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6014
6015 union nvme_timestamp {
6016 struct {
6017 uint64_t timestamp:48;
6018 uint64_t sync:1;
6019 uint64_t origin:3;
6020 uint64_t rsvd1:12;
6021 };
6022 uint64_t all;
6023 };
6024
6025 union nvme_timestamp ts;
6026 ts.all = 0;
6027 ts.timestamp = n->host_timestamp + elapsed_time;
6028
6029 /* If the host timestamp is non-zero, set the timestamp origin */
6030 ts.origin = n->host_timestamp ? 0x01 : 0x00;
6031
6032 trace_pci_nvme_getfeat_timestamp(ts.all);
6033
6034 return cpu_to_le64(ts.all);
6035 }
6036
6037 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6038 {
6039 uint64_t timestamp = nvme_get_timestamp(n);
6040
6041     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6042 }
6043
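/*
 * Get the FDP Mode feature: reports FDPE set with configuration index 0 when
 * FDP is enabled for the subsystem endurance group, and fails with Invalid
 * Field otherwise.
 */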
6044 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6045 uint32_t *result)
6046 {
6047 *result = 0;
6048
6049 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6050 return NVME_INVALID_FIELD | NVME_DNR;
6051 }
6052
6053 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6054 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6055
6056 return NVME_SUCCESS;
6057 }
6058
6059 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6060 NvmeRequest *req, uint32_t *result)
6061 {
6062 NvmeCmd *cmd = &req->cmd;
6063 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6064 uint16_t ph = cdw11 & 0xffff;
6065 uint8_t noet = (cdw11 >> 16) & 0xff;
6066 uint16_t ruhid, ret;
6067 uint32_t nentries = 0;
6068 uint8_t s_events_ndx = 0;
6069 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6070 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6071 NvmeRuHandle *ruh;
6072 NvmeFdpEventDescr *s_event;
6073
6074 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6075 return NVME_FDP_DISABLED | NVME_DNR;
6076 }
6077
6078 if (!nvme_ph_valid(ns, ph)) {
6079 return NVME_INVALID_FIELD | NVME_DNR;
6080 }
6081
6082 ruhid = ns->fdp.phs[ph];
6083 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6084
6085 assert(ruh);
6086
6087 if (unlikely(noet == 0)) {
6088 return NVME_INVALID_FIELD | NVME_DNR;
6089 }
6090
6091 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6092 uint8_t shift = nvme_fdp_evf_shifts[event_type];
6093 if (!shift && event_type) {
6094 /*
6095              * Only the first entry (event_type == 0) has a shift value of 0;
6096 * other entries are simply unpopulated.
6097 */
6098 continue;
6099 }
6100
6101 nentries++;
6102
6103 s_event = &s_events[s_events_ndx];
6104 s_event->evt = event_type;
6105 s_event->evta = (ruh->event_filter >> shift) & 0x1;
6106
6107 /* break if all `noet` entries are filled */
6108 if ((++s_events_ndx) == noet) {
6109 break;
6110 }
6111 }
6112
6113 ret = nvme_c2h(n, s_events, s_events_siz, req);
6114 if (ret) {
6115 return ret;
6116 }
6117
6118 *result = nentries;
6119 return NVME_SUCCESS;
6120 }
6121
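/*
 * Get Features. The 'current' selection returns live values, 'saved' (no
 * features are saveable by this controller) and 'default' fall through to the
 * default values, and 'supported capabilities' returns the per-feature
 * capability flags.
 */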
6122 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6123 {
6124 NvmeCmd *cmd = &req->cmd;
6125 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6126 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6127 uint32_t nsid = le32_to_cpu(cmd->nsid);
6128 uint32_t result = 0;
6129 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6130 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6131 uint16_t iv;
6132 NvmeNamespace *ns;
6133 int i;
6134 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6135
6136 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6137 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6138 };
6139
6140 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6141
6142 if (!nvme_feature_support[fid]) {
6143 return NVME_INVALID_FIELD | NVME_DNR;
6144 }
6145
6146 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6147 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6148 /*
6149 * The Reservation Notification Mask and Reservation Persistence
6150 * features require a status code of Invalid Field in Command when
6151 * NSID is FFFFFFFFh. Since the device does not support those
6152 * features we can always return Invalid Namespace or Format as we
6153 * should do for all other features.
6154 */
6155 return NVME_INVALID_NSID | NVME_DNR;
6156 }
6157
6158 if (!nvme_ns(n, nsid)) {
6159 return NVME_INVALID_FIELD | NVME_DNR;
6160 }
6161 }
6162
6163 switch (sel) {
6164 case NVME_GETFEAT_SELECT_CURRENT:
6165 break;
6166 case NVME_GETFEAT_SELECT_SAVED:
6167 /* no features are saveable by the controller; fallthrough */
6168 case NVME_GETFEAT_SELECT_DEFAULT:
6169 goto defaults;
6170 case NVME_GETFEAT_SELECT_CAP:
6171 result = nvme_feature_cap[fid];
6172 goto out;
6173 }
6174
6175 switch (fid) {
6176 case NVME_TEMPERATURE_THRESHOLD:
6177 result = 0;
6178
6179 /*
6180 * The controller only implements the Composite Temperature sensor, so
6181 * return 0 for all other sensors.
6182 */
6183 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6184 goto out;
6185 }
6186
6187 switch (NVME_TEMP_THSEL(dw11)) {
6188 case NVME_TEMP_THSEL_OVER:
6189 result = n->features.temp_thresh_hi;
6190 goto out;
6191 case NVME_TEMP_THSEL_UNDER:
6192 result = n->features.temp_thresh_low;
6193 goto out;
6194 }
6195
6196 return NVME_INVALID_FIELD | NVME_DNR;
6197 case NVME_ERROR_RECOVERY:
6198 if (!nvme_nsid_valid(n, nsid)) {
6199 return NVME_INVALID_NSID | NVME_DNR;
6200 }
6201
6202 ns = nvme_ns(n, nsid);
6203 if (unlikely(!ns)) {
6204 return NVME_INVALID_FIELD | NVME_DNR;
6205 }
6206
6207 result = ns->features.err_rec;
6208 goto out;
6209 case NVME_VOLATILE_WRITE_CACHE:
6210 result = 0;
6211 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6212 ns = nvme_ns(n, i);
6213 if (!ns) {
6214 continue;
6215 }
6216
6217 result = blk_enable_write_cache(ns->blkconf.blk);
6218 if (result) {
6219 break;
6220 }
6221 }
6222 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6223 goto out;
6224 case NVME_ASYNCHRONOUS_EVENT_CONF:
6225 result = n->features.async_config;
6226 goto out;
6227 case NVME_TIMESTAMP:
6228 return nvme_get_feature_timestamp(n, req);
6229 case NVME_HOST_BEHAVIOR_SUPPORT:
6230 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6231 sizeof(n->features.hbs), req);
6232 case NVME_FDP_MODE:
6233 endgrpid = dw11 & 0xff;
6234
6235 if (endgrpid != 0x1) {
6236 return NVME_INVALID_FIELD | NVME_DNR;
6237 }
6238
6239 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6240 if (ret) {
6241 return ret;
6242 }
6243 goto out;
6244 case NVME_FDP_EVENTS:
6245 if (!nvme_nsid_valid(n, nsid)) {
6246 return NVME_INVALID_NSID | NVME_DNR;
6247 }
6248
6249 ns = nvme_ns(n, nsid);
6250 if (unlikely(!ns)) {
6251 return NVME_INVALID_FIELD | NVME_DNR;
6252 }
6253
6254 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6255 if (ret) {
6256 return ret;
6257 }
6258 goto out;
6259 default:
6260 break;
6261 }
6262
6263 defaults:
6264 switch (fid) {
6265 case NVME_TEMPERATURE_THRESHOLD:
6266 result = 0;
6267
6268 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6269 break;
6270 }
6271
6272 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6273 result = NVME_TEMPERATURE_WARNING;
6274 }
6275
6276 break;
6277 case NVME_NUMBER_OF_QUEUES:
6278 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6279 trace_pci_nvme_getfeat_numq(result);
6280 break;
6281 case NVME_INTERRUPT_VECTOR_CONF:
6282 iv = dw11 & 0xffff;
6283 if (iv >= n->conf_ioqpairs + 1) {
6284 return NVME_INVALID_FIELD | NVME_DNR;
6285 }
6286
6287 result = iv;
6288 if (iv == n->admin_cq.vector) {
6289 result |= NVME_INTVC_NOCOALESCING;
6290 }
6291 break;
6292 case NVME_FDP_MODE:
6293 endgrpid = dw11 & 0xff;
6294
6295 if (endgrpid != 0x1) {
6296 return NVME_INVALID_FIELD | NVME_DNR;
6297 }
6298
6299 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6300 if (ret) {
6301 return ret;
6302 }
6303         goto out;
6306 default:
6307 result = nvme_feature_default[fid];
6308 break;
6309 }
6310
6311 out:
6312 req->cqe.result = cpu_to_le32(result);
6313 return ret;
6314 }
6315
6316 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6317 {
6318 uint16_t ret;
6319 uint64_t timestamp;
6320
6321     ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6322 if (ret) {
6323 return ret;
6324 }
6325
6326 nvme_set_timestamp(n, timestamp);
6327
6328 return NVME_SUCCESS;
6329 }
6330
6331 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6332 NvmeRequest *req)
6333 {
6334 NvmeCmd *cmd = &req->cmd;
6335 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6336 uint16_t ph = cdw11 & 0xffff;
6337 uint8_t noet = (cdw11 >> 16) & 0xff;
6338 uint16_t ret, ruhid;
6339 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6340 uint8_t event_mask = 0;
6341 unsigned int i;
6342 g_autofree uint8_t *events = g_malloc0(noet);
6343 NvmeRuHandle *ruh = NULL;
6344
6345 assert(ns);
6346
6347 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6348 return NVME_FDP_DISABLED | NVME_DNR;
6349 }
6350
6351 if (!nvme_ph_valid(ns, ph)) {
6352 return NVME_INVALID_FIELD | NVME_DNR;
6353 }
6354
6355 ruhid = ns->fdp.phs[ph];
6356 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6357
6358 ret = nvme_h2c(n, events, noet, req);
6359 if (ret) {
6360 return ret;
6361 }
6362
6363 for (i = 0; i < noet; i++) {
6364 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6365 }
6366
6367 if (enable) {
6368 ruh->event_filter |= event_mask;
6369 } else {
6370 ruh->event_filter = ruh->event_filter & ~event_mask;
6371 }
6372
6373 return NVME_SUCCESS;
6374 }
6375
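/*
 * Set Features. The save bit is validated against the per-feature
 * capabilities, the namespace is resolved for namespace-specific features,
 * and the new value is then applied by the per-feature handlers below.
 */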
6376 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6377 {
6378 NvmeNamespace *ns = NULL;
6379
6380 NvmeCmd *cmd = &req->cmd;
6381 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6382 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6383 uint32_t nsid = le32_to_cpu(cmd->nsid);
6384 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6385 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6386 uint16_t status;
6387 int i;
6388
6389 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6390
6391 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6392 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6393 }
6394
6395 if (!nvme_feature_support[fid]) {
6396 return NVME_INVALID_FIELD | NVME_DNR;
6397 }
6398
6399 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6400 if (nsid != NVME_NSID_BROADCAST) {
6401 if (!nvme_nsid_valid(n, nsid)) {
6402 return NVME_INVALID_NSID | NVME_DNR;
6403 }
6404
6405 ns = nvme_ns(n, nsid);
6406 if (unlikely(!ns)) {
6407 return NVME_INVALID_FIELD | NVME_DNR;
6408 }
6409 }
6410 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6411 if (!nvme_nsid_valid(n, nsid)) {
6412 return NVME_INVALID_NSID | NVME_DNR;
6413 }
6414
6415 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6416 }
6417
6418 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6419 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6420 }
6421
6422 switch (fid) {
6423 case NVME_TEMPERATURE_THRESHOLD:
6424 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6425 break;
6426 }
6427
6428 switch (NVME_TEMP_THSEL(dw11)) {
6429 case NVME_TEMP_THSEL_OVER:
6430 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6431 break;
6432 case NVME_TEMP_THSEL_UNDER:
6433 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6434 break;
6435 default:
6436 return NVME_INVALID_FIELD | NVME_DNR;
6437 }
6438
6439 if ((n->temperature >= n->features.temp_thresh_hi) ||
6440 (n->temperature <= n->features.temp_thresh_low)) {
6441 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6442 }
6443
6444 break;
6445 case NVME_ERROR_RECOVERY:
6446 if (nsid == NVME_NSID_BROADCAST) {
6447 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6448 ns = nvme_ns(n, i);
6449
6450 if (!ns) {
6451 continue;
6452 }
6453
6454 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6455 ns->features.err_rec = dw11;
6456 }
6457 }
6458
6459 break;
6460 }
6461
6462 assert(ns);
6463 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6464 ns->features.err_rec = dw11;
6465 }
6466 break;
6467 case NVME_VOLATILE_WRITE_CACHE:
6468 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6469 ns = nvme_ns(n, i);
6470 if (!ns) {
6471 continue;
6472 }
6473
6474 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6475 blk_flush(ns->blkconf.blk);
6476 }
6477
6478 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6479 }
6480
6481 break;
6482
6483 case NVME_NUMBER_OF_QUEUES:
6484 if (n->qs_created) {
6485 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6486 }
6487
6488 /*
6489 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6490 * and NSQR.
6491 */
6492 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6493 return NVME_INVALID_FIELD | NVME_DNR;
6494 }
6495
6496 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6497 ((dw11 >> 16) & 0xffff) + 1,
6498 n->conf_ioqpairs,
6499 n->conf_ioqpairs);
6500 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6501 ((n->conf_ioqpairs - 1) << 16));
6502 break;
6503 case NVME_ASYNCHRONOUS_EVENT_CONF:
6504 n->features.async_config = dw11;
6505 break;
6506 case NVME_TIMESTAMP:
6507 return nvme_set_feature_timestamp(n, req);
6508 case NVME_HOST_BEHAVIOR_SUPPORT:
6509 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6510 sizeof(n->features.hbs), req);
6511 if (status) {
6512 return status;
6513 }
6514
6515 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6516 ns = nvme_ns(n, i);
6517
6518 if (!ns) {
6519 continue;
6520 }
6521
6522 ns->id_ns.nlbaf = ns->nlbaf - 1;
6523 if (!n->features.hbs.lbafee) {
6524 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6525 }
6526 }
6527
6528 return status;
6529 case NVME_COMMAND_SET_PROFILE:
6530 if (dw11 & 0x1ff) {
6531 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6532 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
6533 }
6534 break;
6535 case NVME_FDP_MODE:
6536         /* spec: abort with cmd seq err if one or more namespaces are in the endgrp */
6537 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6538 case NVME_FDP_EVENTS:
6539 return nvme_set_feature_fdp_events(n, ns, req);
6540 default:
6541 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6542 }
6543 return NVME_SUCCESS;
6544 }
6545
6546 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6547 {
6548 trace_pci_nvme_aer(nvme_cid(req));
6549
6550 if (n->outstanding_aers > n->params.aerl) {
6551 trace_pci_nvme_aer_aerl_exceeded();
6552 return NVME_AER_LIMIT_EXCEEDED;
6553 }
6554
6555 n->aer_reqs[n->outstanding_aers] = req;
6556 n->outstanding_aers++;
6557
6558 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6559 nvme_process_aers(n);
6560 }
6561
6562 return NVME_NO_COMPLETE;
6563 }
6564
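/*
 * Tighten the reported DMRSL so that, for every attached namespace, a single
 * range never spans more than BDRV_REQUEST_MAX_BYTES worth of logical blocks.
 */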
6565 static void nvme_update_dmrsl(NvmeCtrl *n)
6566 {
6567 int nsid;
6568
6569 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6570 NvmeNamespace *ns = nvme_ns(n, nsid);
6571 if (!ns) {
6572 continue;
6573 }
6574
6575 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6576 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6577 }
6578 }
6579
6580 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
6581 {
6582 uint32_t cc = ldl_le_p(&n->bar.cc);
6583
6584 ns->iocs = nvme_cse_iocs_none;
6585 switch (ns->csi) {
6586 case NVME_CSI_NVM:
6587 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
6588 ns->iocs = nvme_cse_iocs_nvm;
6589 }
6590 break;
6591 case NVME_CSI_ZONED:
6592 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
6593 ns->iocs = nvme_cse_iocs_zoned;
6594 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
6595 ns->iocs = nvme_cse_iocs_nvm;
6596 }
6597 break;
6598 }
6599 }
6600
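/*
 * Namespace Attachment. The host supplies a controller list in the data
 * buffer; the selected namespace is attached to or detached from each listed
 * controller, and a Namespace Attribute Changed event is queued for any
 * controller that does not already have the namespace marked in its changed
 * namespace list.
 */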
6601 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6602 {
6603 NvmeNamespace *ns;
6604 NvmeCtrl *ctrl;
6605 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6606 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6607 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6608 uint8_t sel = dw10 & 0xf;
6609 uint16_t *nr_ids = &list[0];
6610 uint16_t *ids = &list[1];
6611 uint16_t ret;
6612 int i;
6613
6614 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6615
6616 if (!nvme_nsid_valid(n, nsid)) {
6617 return NVME_INVALID_NSID | NVME_DNR;
6618 }
6619
6620 ns = nvme_subsys_ns(n->subsys, nsid);
6621 if (!ns) {
6622 return NVME_INVALID_FIELD | NVME_DNR;
6623 }
6624
6625 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6626 if (ret) {
6627 return ret;
6628 }
6629
6630 if (!*nr_ids) {
6631 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6632 }
6633
6634 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6635 for (i = 0; i < *nr_ids; i++) {
6636 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6637 if (!ctrl) {
6638 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6639 }
6640
6641 switch (sel) {
6642 case NVME_NS_ATTACHMENT_ATTACH:
6643 if (nvme_ns(ctrl, nsid)) {
6644 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6645 }
6646
6647 if (ns->attached && !ns->params.shared) {
6648 return NVME_NS_PRIVATE | NVME_DNR;
6649 }
6650
6651 nvme_attach_ns(ctrl, ns);
6652 nvme_select_iocs_ns(ctrl, ns);
6653
6654 break;
6655
6656 case NVME_NS_ATTACHMENT_DETACH:
6657 if (!nvme_ns(ctrl, nsid)) {
6658 return NVME_NS_NOT_ATTACHED | NVME_DNR;
6659 }
6660
6661 ctrl->namespaces[nsid] = NULL;
6662 ns->attached--;
6663
6664 nvme_update_dmrsl(ctrl);
6665
6666 break;
6667
6668 default:
6669 return NVME_INVALID_FIELD | NVME_DNR;
6670 }
6671
6672 /*
6673 * Add namespace id to the changed namespace id list for event clearing
6674 * via Get Log Page command.
6675 */
6676 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6677 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6678 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6679 NVME_LOG_CHANGED_NSLIST);
6680 }
6681 }
6682
6683 return NVME_SUCCESS;
6684 }
6685
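/*
 * Per-request state for the asynchronous Format NVM operation. Namespaces are
 * formatted one at a time (all of them for the broadcast NSID) by zero-filling
 * the underlying block device in chunks of up to BDRV_REQUEST_MAX_BYTES before
 * the new LBA format and protection information settings are applied.
 */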
6686 typedef struct NvmeFormatAIOCB {
6687 BlockAIOCB common;
6688 BlockAIOCB *aiocb;
6689 NvmeRequest *req;
6690 int ret;
6691
6692 NvmeNamespace *ns;
6693 uint32_t nsid;
6694 bool broadcast;
6695 int64_t offset;
6696
6697 uint8_t lbaf;
6698 uint8_t mset;
6699 uint8_t pi;
6700 uint8_t pil;
6701 } NvmeFormatAIOCB;
6702
6703 static void nvme_format_cancel(BlockAIOCB *aiocb)
6704 {
6705 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6706
6707 iocb->ret = -ECANCELED;
6708
6709 if (iocb->aiocb) {
6710 blk_aio_cancel_async(iocb->aiocb);
6711 iocb->aiocb = NULL;
6712 }
6713 }
6714
6715 static const AIOCBInfo nvme_format_aiocb_info = {
6716 .aiocb_size = sizeof(NvmeFormatAIOCB),
6717 .cancel_async = nvme_format_cancel,
6718 };
6719
6720 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6721 uint8_t pi, uint8_t pil)
6722 {
6723 uint8_t lbafl = lbaf & 0xf;
6724 uint8_t lbafu = lbaf >> 4;
6725
6726 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6727
6728 ns->id_ns.dps = (pil << 3) | pi;
6729 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6730
6731 nvme_ns_init_format(ns);
6732 }
6733
6734 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6735
6736 static void nvme_format_ns_cb(void *opaque, int ret)
6737 {
6738 NvmeFormatAIOCB *iocb = opaque;
6739 NvmeNamespace *ns = iocb->ns;
6740 int bytes;
6741
6742 if (iocb->ret < 0) {
6743 goto done;
6744 } else if (ret < 0) {
6745 iocb->ret = ret;
6746 goto done;
6747 }
6748
6749 assert(ns);
6750
6751 if (iocb->offset < ns->size) {
6752 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6753
6754 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6755 bytes, BDRV_REQ_MAY_UNMAP,
6756 nvme_format_ns_cb, iocb);
6757
6758 iocb->offset += bytes;
6759 return;
6760 }
6761
6762 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6763 ns->status = 0x0;
6764 iocb->ns = NULL;
6765 iocb->offset = 0;
6766
6767 done:
6768 nvme_do_format(iocb);
6769 }
6770
6771 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6772 {
6773 if (ns->params.zoned) {
6774 return NVME_INVALID_FORMAT | NVME_DNR;
6775 }
6776
6777 if (lbaf > ns->id_ns.nlbaf) {
6778 return NVME_INVALID_FORMAT | NVME_DNR;
6779 }
6780
6781 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6782 return NVME_INVALID_FORMAT | NVME_DNR;
6783 }
6784
6785 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6786 return NVME_INVALID_FIELD | NVME_DNR;
6787 }
6788
6789 return NVME_SUCCESS;
6790 }
6791
6792 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6793 {
6794 NvmeRequest *req = iocb->req;
6795 NvmeCtrl *n = nvme_ctrl(req);
6796 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6797 uint8_t lbaf = dw10 & 0xf;
6798 uint8_t pi = (dw10 >> 5) & 0x7;
6799 uint16_t status;
6800 int i;
6801
6802 if (iocb->ret < 0) {
6803 goto done;
6804 }
6805
6806 if (iocb->broadcast) {
6807 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6808 iocb->ns = nvme_ns(n, i);
6809 if (iocb->ns) {
6810 iocb->nsid = i;
6811 break;
6812 }
6813 }
6814 }
6815
6816 if (!iocb->ns) {
6817 goto done;
6818 }
6819
6820 status = nvme_format_check(iocb->ns, lbaf, pi);
6821 if (status) {
6822 req->status = status;
6823 goto done;
6824 }
6825
6826 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
6827 nvme_format_ns_cb(iocb, 0);
6828 return;
6829
6830 done:
6831 iocb->common.cb(iocb->common.opaque, iocb->ret);
6832 qemu_aio_unref(iocb);
6833 }
6834
6835 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
6836 {
6837 NvmeFormatAIOCB *iocb;
6838 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6839 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6840 uint8_t lbaf = dw10 & 0xf;
6841 uint8_t mset = (dw10 >> 4) & 0x1;
6842 uint8_t pi = (dw10 >> 5) & 0x7;
6843 uint8_t pil = (dw10 >> 8) & 0x1;
6844 uint8_t lbafu = (dw10 >> 12) & 0x3;
6845 uint16_t status;
6846
6847 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
6848
6849 iocb->req = req;
6850 iocb->ret = 0;
6851 iocb->ns = NULL;
6852 iocb->nsid = 0;
6853 iocb->lbaf = lbaf;
6854 iocb->mset = mset;
6855 iocb->pi = pi;
6856 iocb->pil = pil;
6857 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
6858 iocb->offset = 0;
6859
6860 if (n->features.hbs.lbafee) {
6861 iocb->lbaf |= lbafu << 4;
6862 }
6863
6864 if (!iocb->broadcast) {
6865 if (!nvme_nsid_valid(n, nsid)) {
6866 status = NVME_INVALID_NSID | NVME_DNR;
6867 goto out;
6868 }
6869
6870 iocb->ns = nvme_ns(n, nsid);
6871 if (!iocb->ns) {
6872 status = NVME_INVALID_FIELD | NVME_DNR;
6873 goto out;
6874 }
6875 }
6876
6877 req->aiocb = &iocb->common;
6878 nvme_do_format(iocb);
6879
6880 return NVME_NO_COMPLETE;
6881
6882 out:
6883 qemu_aio_unref(iocb);
6884
6885 return status;
6886 }
6887
6888 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
6889 int *num_prim, int *num_sec)
6890 {
6891 *num_total = le32_to_cpu(rt ?
6892 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
6893 *num_prim = le16_to_cpu(rt ?
6894 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
6895 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
6896 }
6897
6898 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
6899 uint16_t cntlid, uint8_t rt,
6900 int nr)
6901 {
6902 int num_total, num_prim, num_sec;
6903
6904 if (cntlid != n->cntlid) {
6905 return NVME_INVALID_CTRL_ID | NVME_DNR;
6906 }
6907
6908 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6909
6910 if (nr > num_total) {
6911 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6912 }
6913
6914 if (nr > num_total - num_sec) {
6915 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6916 }
6917
6918 if (rt) {
6919 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
6920 } else {
6921 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
6922 }
6923
6924 req->cqe.result = cpu_to_le32(nr);
6925 return req->status;
6926 }
6927
6928 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
6929 uint8_t rt, int nr)
6930 {
6931 int prev_nr, prev_total;
6932
6933 if (rt) {
6934 prev_nr = le16_to_cpu(sctrl->nvi);
6935 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
6936 sctrl->nvi = cpu_to_le16(nr);
6937 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
6938 } else {
6939 prev_nr = le16_to_cpu(sctrl->nvq);
6940 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
6941 sctrl->nvq = cpu_to_le16(nr);
6942 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
6943 }
6944 }
6945
6946 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
6947 uint16_t cntlid, uint8_t rt, int nr)
6948 {
6949 int num_total, num_prim, num_sec, num_free, diff, limit;
6950 NvmeSecCtrlEntry *sctrl;
6951
6952 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6953 if (!sctrl) {
6954 return NVME_INVALID_CTRL_ID | NVME_DNR;
6955 }
6956
6957 if (sctrl->scs) {
6958 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6959 }
6960
6961 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
6962 if (nr > limit) {
6963 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6964 }
6965
6966 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6967 num_free = num_total - num_prim - num_sec;
6968 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
6969
6970 if (diff > num_free) {
6971 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6972 }
6973
6974 nvme_update_virt_res(n, sctrl, rt, nr);
6975 req->cqe.result = cpu_to_le32(nr);
6976
6977 return req->status;
6978 }
6979
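/*
 * Online/offline a secondary controller. Onlining requires that flexible
 * resources have already been assigned (at least one interrupt vector and two
 * queues); offlining releases the assigned flexible resources and resets the
 * corresponding VF controller.
 */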
6980 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
6981 {
6982 PCIDevice *pci = PCI_DEVICE(n);
6983 NvmeCtrl *sn = NULL;
6984 NvmeSecCtrlEntry *sctrl;
6985 int vf_index;
6986
6987 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6988 if (!sctrl) {
6989 return NVME_INVALID_CTRL_ID | NVME_DNR;
6990 }
6991
6992 if (!pci_is_vf(pci)) {
6993 vf_index = le16_to_cpu(sctrl->vfn) - 1;
6994 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
6995 }
6996
6997 if (online) {
6998 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
6999 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7000 }
7001
7002 if (!sctrl->scs) {
7003 sctrl->scs = 0x1;
7004 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7005 }
7006 } else {
7007 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7008 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7009
7010 if (sctrl->scs) {
7011 sctrl->scs = 0x0;
7012 if (sn) {
7013 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7014 }
7015 }
7016 }
7017
7018 return NVME_SUCCESS;
7019 }
7020
7021 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7022 {
7023 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7024 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7025 uint8_t act = dw10 & 0xf;
7026 uint8_t rt = (dw10 >> 8) & 0x7;
7027 uint16_t cntlid = (dw10 >> 16) & 0xffff;
7028 int nr = dw11 & 0xffff;
7029
7030 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7031
7032 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7033 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7034 }
7035
7036 switch (act) {
7037 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7038 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7039 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7040 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7041 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7042 return nvme_virt_set_state(n, cntlid, true);
7043 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7044 return nvme_virt_set_state(n, cntlid, false);
7045 default:
7046 return NVME_INVALID_FIELD | NVME_DNR;
7047 }
7048 }
7049
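/*
 * Doorbell Buffer Config. PRP1 and PRP2 carry the page-aligned shadow
 * doorbell and eventidx buffer addresses; any already existing queues are
 * switched over in place and their current tail/head values are pushed out to
 * the shadow buffer.
 */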
7050 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7051 {
7052 PCIDevice *pci = PCI_DEVICE(n);
7053 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7054 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7055 int i;
7056
7057 /* Address should be page aligned */
7058 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7059 return NVME_INVALID_FIELD | NVME_DNR;
7060 }
7061
7062 /* Save shadow buffer base addr for use during queue creation */
7063 n->dbbuf_dbs = dbs_addr;
7064 n->dbbuf_eis = eis_addr;
7065 n->dbbuf_enabled = true;
7066
7067 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7068 NvmeSQueue *sq = n->sq[i];
7069 NvmeCQueue *cq = n->cq[i];
7070
7071 if (sq) {
7072 /*
7073              * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3).
7074 * nvme_process_db() uses this hard-coded way to calculate
7075 * doorbell offsets. Be consistent with that here.
7076 */
7077 sq->db_addr = dbs_addr + (i << 3);
7078 sq->ei_addr = eis_addr + (i << 3);
7079 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7080
7081 if (n->params.ioeventfd && sq->sqid != 0) {
7082 if (!nvme_init_sq_ioeventfd(sq)) {
7083 sq->ioeventfd_enabled = true;
7084 }
7085 }
7086 }
7087
7088 if (cq) {
7089 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7090 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7091 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7092 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7093
7094 if (n->params.ioeventfd && cq->cqid != 0) {
7095 if (!nvme_init_cq_ioeventfd(cq)) {
7096 cq->ioeventfd_enabled = true;
7097 }
7098 }
7099 }
7100 }
7101
7102 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7103
7104 return NVME_SUCCESS;
7105 }
7106
7107 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7108 {
7109 return NVME_INVALID_FIELD | NVME_DNR;
7110 }
7111
7112 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7113 {
7114 NvmeNamespace *ns;
7115 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7116 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7117 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7118 uint8_t doper, dtype;
7119 uint32_t numd, trans_len;
7120 NvmeDirectiveIdentify id = {
7121 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7122 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7123 };
7124
7125 numd = dw10 + 1;
7126 doper = dw11 & 0xff;
7127 dtype = (dw11 >> 8) & 0xff;
7128
7129 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7130
7131 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7132 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7133 return NVME_INVALID_FIELD | NVME_DNR;
7134 }
7135
7136 ns = nvme_ns(n, nsid);
7137 if (!ns) {
7138 return NVME_INVALID_FIELD | NVME_DNR;
7139 }
7140
7141 switch (dtype) {
7142 case NVME_DIRECTIVE_IDENTIFY:
7143 switch (doper) {
7144 case NVME_DIRECTIVE_RETURN_PARAMS:
7145 if (ns->endgrp && ns->endgrp->fdp.enabled) {
7146 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7147 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7148 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7149 }
7150
7151 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7152
7153 default:
7154 return NVME_INVALID_FIELD | NVME_DNR;
7155 }
7156
7157 default:
7158 return NVME_INVALID_FIELD;
7159 }
7160 }
7161
7162 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7163 {
7164 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7165 nvme_adm_opc_str(req->cmd.opcode));
7166
7167 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7168 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7169 return NVME_INVALID_OPCODE | NVME_DNR;
7170 }
7171
7172 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7173 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7174 return NVME_INVALID_FIELD | NVME_DNR;
7175 }
7176
7177 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7178 return NVME_INVALID_FIELD;
7179 }
7180
7181 switch (req->cmd.opcode) {
7182 case NVME_ADM_CMD_DELETE_SQ:
7183 return nvme_del_sq(n, req);
7184 case NVME_ADM_CMD_CREATE_SQ:
7185 return nvme_create_sq(n, req);
7186 case NVME_ADM_CMD_GET_LOG_PAGE:
7187 return nvme_get_log(n, req);
7188 case NVME_ADM_CMD_DELETE_CQ:
7189 return nvme_del_cq(n, req);
7190 case NVME_ADM_CMD_CREATE_CQ:
7191 return nvme_create_cq(n, req);
7192 case NVME_ADM_CMD_IDENTIFY:
7193 return nvme_identify(n, req);
7194 case NVME_ADM_CMD_ABORT:
7195 return nvme_abort(n, req);
7196 case NVME_ADM_CMD_SET_FEATURES:
7197 return nvme_set_feature(n, req);
7198 case NVME_ADM_CMD_GET_FEATURES:
7199 return nvme_get_feature(n, req);
7200 case NVME_ADM_CMD_ASYNC_EV_REQ:
7201 return nvme_aer(n, req);
7202 case NVME_ADM_CMD_NS_ATTACHMENT:
7203 return nvme_ns_attachment(n, req);
7204 case NVME_ADM_CMD_VIRT_MNGMT:
7205 return nvme_virt_mngmt(n, req);
7206 case NVME_ADM_CMD_DBBUF_CONFIG:
7207 return nvme_dbbuf_config(n, req);
7208 case NVME_ADM_CMD_FORMAT_NVM:
7209 return nvme_format(n, req);
7210 case NVME_ADM_CMD_DIRECTIVE_SEND:
7211 return nvme_directive_send(n, req);
7212 case NVME_ADM_CMD_DIRECTIVE_RECV:
7213 return nvme_directive_receive(n, req);
7214 default:
7215 assert(false);
7216 }
7217
7218 return NVME_INVALID_OPCODE | NVME_DNR;
7219 }
7220
7221 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7222 {
7223 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7224
7225 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7226 MEMTXATTRS_UNSPECIFIED);
7227 }
7228
7229 static void nvme_update_sq_tail(NvmeSQueue *sq)
7230 {
7231 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7232 MEMTXATTRS_UNSPECIFIED);
7233
7234 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7235 }
7236
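/*
 * Submission queue processing (scheduled as a bottom half): fetch commands at
 * the current head, dispatch them to the admin or I/O handler and post a
 * completion unless the handler returns NVME_NO_COMPLETE. With shadow
 * doorbells enabled, the shadow tail is re-read and the eventidx updated on
 * every iteration.
 */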
7237 static void nvme_process_sq(void *opaque)
7238 {
7239 NvmeSQueue *sq = opaque;
7240 NvmeCtrl *n = sq->ctrl;
7241 NvmeCQueue *cq = n->cq[sq->cqid];
7242
7243 uint16_t status;
7244 hwaddr addr;
7245 NvmeCmd cmd;
7246 NvmeRequest *req;
7247
7248 if (n->dbbuf_enabled) {
7249 nvme_update_sq_tail(sq);
7250 }
7251
7252 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7253 addr = sq->dma_addr + (sq->head << NVME_SQES);
7254 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7255 trace_pci_nvme_err_addr_read(addr);
7256 trace_pci_nvme_err_cfs();
7257 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7258 break;
7259 }
7260 nvme_inc_sq_head(sq);
7261
7262 req = QTAILQ_FIRST(&sq->req_list);
7263 QTAILQ_REMOVE(&sq->req_list, req, entry);
7264 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7265 nvme_req_clear(req);
7266 req->cqe.cid = cmd.cid;
7267 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7268
7269 status = sq->sqid ? nvme_io_cmd(n, req) :
7270 nvme_admin_cmd(n, req);
7271 if (status != NVME_NO_COMPLETE) {
7272 req->status = status;
7273 nvme_enqueue_req_completion(cq, req);
7274 }
7275
7276 if (n->dbbuf_enabled) {
7277 nvme_update_sq_eventidx(sq);
7278 nvme_update_sq_tail(sq);
7279 }
7280 }
7281 }
7282
7283 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7284 {
7285 uint8_t *config;
7286
7287 if (!msix_present(pci_dev)) {
7288 return;
7289 }
7290
7291 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7292
7293 config = pci_dev->config + pci_dev->msix_cap;
7294 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7295 table_size - 1);
7296 }
7297
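/*
 * Apply the pending flexible resource configuration: a VF derives its queue
 * and interrupt counts from its secondary controller entry, while the PF
 * folds the flexible resources assigned to itself into its own limits.
 */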
7298 static void nvme_activate_virt_res(NvmeCtrl *n)
7299 {
7300 PCIDevice *pci_dev = PCI_DEVICE(n);
7301 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7302 NvmeSecCtrlEntry *sctrl;
7303
7304 /* -1 to account for the admin queue */
7305 if (pci_is_vf(pci_dev)) {
7306 sctrl = nvme_sctrl(n);
7307 cap->vqprt = sctrl->nvq;
7308 cap->viprt = sctrl->nvi;
7309 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7310 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7311 } else {
7312 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7313 cap->virfap = n->next_pri_ctrl_cap.virfap;
7314 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7315 le16_to_cpu(cap->vqrfap) - 1;
7316 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7317 le16_to_cpu(cap->virfap);
7318 }
7319 }
7320
7321 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7322 {
7323 PCIDevice *pci_dev = PCI_DEVICE(n);
7324 NvmeSecCtrlEntry *sctrl;
7325 NvmeNamespace *ns;
7326 int i;
7327
7328 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7329 ns = nvme_ns(n, i);
7330 if (!ns) {
7331 continue;
7332 }
7333
7334 nvme_ns_drain(ns);
7335 }
7336
7337 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7338 if (n->sq[i] != NULL) {
7339 nvme_free_sq(n->sq[i], n);
7340 }
7341 }
7342 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7343 if (n->cq[i] != NULL) {
7344 nvme_free_cq(n->cq[i], n);
7345 }
7346 }
7347
7348 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7349 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7350 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7351 g_free(event);
7352 }
7353
7354 if (n->params.sriov_max_vfs) {
7355 if (!pci_is_vf(pci_dev)) {
7356 for (i = 0; i < n->nr_sec_ctrls; i++) {
7357 sctrl = &n->sec_ctrl_list[i];
7358 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7359 }
7360 }
7361
7362 if (rst != NVME_RESET_CONTROLLER) {
7363 nvme_activate_virt_res(n);
7364 }
7365 }
7366
7367 n->aer_queued = 0;
7368 n->aer_mask = 0;
7369 n->outstanding_aers = 0;
7370 n->qs_created = false;
7371
7372 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7373
7374 if (pci_is_vf(pci_dev)) {
7375 sctrl = nvme_sctrl(n);
7376
7377 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7378 } else {
7379 stl_le_p(&n->bar.csts, 0);
7380 }
7381
7382 stl_le_p(&n->bar.intms, 0);
7383 stl_le_p(&n->bar.intmc, 0);
7384 stl_le_p(&n->bar.cc, 0);
7385
7386 n->dbbuf_dbs = 0;
7387 n->dbbuf_eis = 0;
7388 n->dbbuf_enabled = false;
7389 }
7390
7391 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7392 {
7393 NvmeNamespace *ns;
7394 int i;
7395
7396 if (n->pmr.dev) {
7397 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7398 }
7399
7400 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7401 ns = nvme_ns(n, i);
7402 if (!ns) {
7403 continue;
7404 }
7405
7406 nvme_ns_shutdown(ns);
7407 }
7408 }
7409
7410 static void nvme_select_iocs(NvmeCtrl *n)
7411 {
7412 NvmeNamespace *ns;
7413 int i;
7414
7415 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7416 ns = nvme_ns(n, i);
7417 if (!ns) {
7418 continue;
7419 }
7420
7421 nvme_select_iocs_ns(n, ns);
7422 }
7423 }
7424
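/*
 * Handle the CC.EN 0 -> 1 transition: validate the admin queue addresses and
 * sizes, the selected command set and the memory page size against CAP before
 * bringing up the admin queues and (re)selecting the per-namespace I/O
 * command sets.
 */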
7425 static int nvme_start_ctrl(NvmeCtrl *n)
7426 {
7427 uint64_t cap = ldq_le_p(&n->bar.cap);
7428 uint32_t cc = ldl_le_p(&n->bar.cc);
7429 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7430 uint64_t asq = ldq_le_p(&n->bar.asq);
7431 uint64_t acq = ldq_le_p(&n->bar.acq);
7432 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7433 uint32_t page_size = 1 << page_bits;
7434 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7435
7436 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7437 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7438 le16_to_cpu(sctrl->nvq));
7439 return -1;
7440 }
7441 if (unlikely(n->cq[0])) {
7442 trace_pci_nvme_err_startfail_cq();
7443 return -1;
7444 }
7445 if (unlikely(n->sq[0])) {
7446 trace_pci_nvme_err_startfail_sq();
7447 return -1;
7448 }
7449 if (unlikely(asq & (page_size - 1))) {
7450 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7451 return -1;
7452 }
7453 if (unlikely(acq & (page_size - 1))) {
7454 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7455 return -1;
7456 }
7457 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7458 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7459 return -1;
7460 }
7461 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7462 trace_pci_nvme_err_startfail_page_too_small(
7463 NVME_CC_MPS(cc),
7464 NVME_CAP_MPSMIN(cap));
7465 return -1;
7466 }
7467 if (unlikely(NVME_CC_MPS(cc) >
7468 NVME_CAP_MPSMAX(cap))) {
7469 trace_pci_nvme_err_startfail_page_too_large(
7470 NVME_CC_MPS(cc),
7471 NVME_CAP_MPSMAX(cap));
7472 return -1;
7473 }
7474 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7475 trace_pci_nvme_err_startfail_asqent_sz_zero();
7476 return -1;
7477 }
7478 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7479 trace_pci_nvme_err_startfail_acqent_sz_zero();
7480 return -1;
7481 }
7482
7483 n->page_bits = page_bits;
7484 n->page_size = page_size;
7485 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7486 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7487 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7488
7489 nvme_set_timestamp(n, 0ULL);
7490
7491 nvme_select_iocs(n);
7492
7493 return 0;
7494 }
7495
7496 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7497 {
7498 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7499 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7500
7501 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7502 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7503 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7504 stl_le_p(&n->bar.cmbloc, cmbloc);
7505
7506 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7507 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7508 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7509 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7510 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7511 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7512 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7513 stl_le_p(&n->bar.cmbsz, cmbsz);
7514 }
7515
7516 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7517 unsigned size)
7518 {
7519 PCIDevice *pci = PCI_DEVICE(n);
7520 uint64_t cap = ldq_le_p(&n->bar.cap);
7521 uint32_t cc = ldl_le_p(&n->bar.cc);
7522 uint32_t intms = ldl_le_p(&n->bar.intms);
7523 uint32_t csts = ldl_le_p(&n->bar.csts);
7524 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7525
7526 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7527 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7528 "MMIO write not 32-bit aligned,"
7529 " offset=0x%"PRIx64"", offset);
7530 /* should be ignored, fall through for now */
7531 }
7532
7533 if (unlikely(size < sizeof(uint32_t))) {
7534 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7535 "MMIO write smaller than 32-bits,"
7536 " offset=0x%"PRIx64", size=%u",
7537 offset, size);
7538 /* should be ignored, fall through for now */
7539 }
7540
7541 switch (offset) {
7542 case NVME_REG_INTMS:
7543 if (unlikely(msix_enabled(pci))) {
7544 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7545 "undefined access to interrupt mask set"
7546 " when MSI-X is enabled");
7547 /* should be ignored, fall through for now */
7548 }
7549 intms |= data;
7550 stl_le_p(&n->bar.intms, intms);
7551 n->bar.intmc = n->bar.intms;
7552 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7553 nvme_irq_check(n);
7554 break;
7555 case NVME_REG_INTMC:
7556 if (unlikely(msix_enabled(pci))) {
7557 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7558 "undefined access to interrupt mask clr"
7559 " when MSI-X is enabled");
7560 /* should be ignored, fall through for now */
7561 }
7562 intms &= ~data;
7563 stl_le_p(&n->bar.intms, intms);
7564 n->bar.intmc = n->bar.intms;
7565 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7566 nvme_irq_check(n);
7567 break;
7568 case NVME_REG_CC:
7569 stl_le_p(&n->bar.cc, data);
7570
7571 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7572
7573 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7574 trace_pci_nvme_mmio_shutdown_set();
7575 nvme_ctrl_shutdown(n);
7576 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7577 csts |= NVME_CSTS_SHST_COMPLETE;
7578 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7579 trace_pci_nvme_mmio_shutdown_cleared();
7580 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7581 }
7582
7583 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7584 if (unlikely(nvme_start_ctrl(n))) {
7585 trace_pci_nvme_err_startfail();
7586 csts = NVME_CSTS_FAILED;
7587 } else {
7588 trace_pci_nvme_mmio_start_success();
7589 csts = NVME_CSTS_READY;
7590 }
7591 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7592 trace_pci_nvme_mmio_stopped();
7593 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7594
7595 break;
7596 }
7597
7598 stl_le_p(&n->bar.csts, csts);
7599
7600 break;
7601 case NVME_REG_CSTS:
7602 if (data & (1 << 4)) {
7603 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7604 "attempted to W1C CSTS.NSSRO"
7605 " but CAP.NSSRS is zero (not supported)");
7606 } else if (data != 0) {
7607 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7608 "attempted to set a read only bit"
7609 " of controller status");
7610 }
7611 break;
7612 case NVME_REG_NSSR:
7613 if (data == 0x4e564d65) {
7614 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7615 } else {
7616 /* The spec says that writes of other values have no effect */
7617 return;
7618 }
7619 break;
7620 case NVME_REG_AQA:
7621 stl_le_p(&n->bar.aqa, data);
7622 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7623 break;
7624 case NVME_REG_ASQ:
7625 stn_le_p(&n->bar.asq, size, data);
7626 trace_pci_nvme_mmio_asqaddr(data);
7627 break;
7628 case NVME_REG_ASQ + 4:
7629 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7630 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7631 break;
7632 case NVME_REG_ACQ:
7633 trace_pci_nvme_mmio_acqaddr(data);
7634 stn_le_p(&n->bar.acq, size, data);
7635 break;
7636 case NVME_REG_ACQ + 4:
7637 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7638 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7639 break;
7640 case NVME_REG_CMBLOC:
7641 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7642 "invalid write to reserved CMBLOC"
7643 " when CMBSZ is zero, ignored");
7644 return;
7645 case NVME_REG_CMBSZ:
7646 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7647 "invalid write to read only CMBSZ, ignored");
7648 return;
7649 case NVME_REG_CMBMSC:
7650 if (!NVME_CAP_CMBS(cap)) {
7651 return;
7652 }
7653
7654 stn_le_p(&n->bar.cmbmsc, size, data);
7655 n->cmb.cmse = false;
7656
7657 if (NVME_CMBMSC_CRE(data)) {
7658 nvme_cmb_enable_regs(n);
7659
7660 if (NVME_CMBMSC_CMSE(data)) {
7661 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7662 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7663 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7664 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7665 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7666 stl_le_p(&n->bar.cmbsts, cmbsts);
7667 return;
7668 }
7669
7670 n->cmb.cba = cba;
7671 n->cmb.cmse = true;
7672 }
7673 } else {
7674 n->bar.cmbsz = 0;
7675 n->bar.cmbloc = 0;
7676 }
7677
7678 return;
7679 case NVME_REG_CMBMSC + 4:
7680 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7681 return;
7682
7683 case NVME_REG_PMRCAP:
7684 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7685 "invalid write to PMRCAP register, ignored");
7686 return;
7687 case NVME_REG_PMRCTL:
7688 if (!NVME_CAP_PMRS(cap)) {
7689 return;
7690 }
7691
7692 stl_le_p(&n->bar.pmrctl, data);
7693 if (NVME_PMRCTL_EN(data)) {
7694 memory_region_set_enabled(&n->pmr.dev->mr, true);
7695 pmrsts = 0;
7696 } else {
7697 memory_region_set_enabled(&n->pmr.dev->mr, false);
7698 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7699 n->pmr.cmse = false;
7700 }
7701 stl_le_p(&n->bar.pmrsts, pmrsts);
7702 return;
7703 case NVME_REG_PMRSTS:
7704 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7705 "invalid write to PMRSTS register, ignored");
7706 return;
7707 case NVME_REG_PMREBS:
7708 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7709 "invalid write to PMREBS register, ignored");
7710 return;
7711 case NVME_REG_PMRSWTP:
7712 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7713 "invalid write to PMRSWTP register, ignored");
7714 return;
7715 case NVME_REG_PMRMSCL:
7716 if (!NVME_CAP_PMRS(cap)) {
7717 return;
7718 }
7719
7720 stl_le_p(&n->bar.pmrmscl, data);
7721 n->pmr.cmse = false;
7722
7723 if (NVME_PMRMSCL_CMSE(data)) {
7724 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7725 hwaddr cba = pmrmscu << 32 |
7726 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7727 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7728 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7729 stl_le_p(&n->bar.pmrsts, pmrsts);
7730 return;
7731 }
7732
7733 n->pmr.cmse = true;
7734 n->pmr.cba = cba;
7735 }
7736
7737 return;
7738 case NVME_REG_PMRMSCU:
7739 if (!NVME_CAP_PMRS(cap)) {
7740 return;
7741 }
7742
7743 stl_le_p(&n->bar.pmrmscu, data);
7744 return;
7745 default:
7746 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7747 "invalid MMIO write,"
7748 " offset=0x%"PRIx64", data=%"PRIx64"",
7749 offset, data);
7750 break;
7751 }
7752 }
7753
7754 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7755 {
7756 NvmeCtrl *n = (NvmeCtrl *)opaque;
7757 uint8_t *ptr = (uint8_t *)&n->bar;
7758
7759 trace_pci_nvme_mmio_read(addr, size);
7760
7761 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7762 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7763 "MMIO read not 32-bit aligned,"
7764 " offset=0x%"PRIx64"", addr);
7765 /* should RAZ, fall through for now */
7766 } else if (unlikely(size < sizeof(uint32_t))) {
7767 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7768 "MMIO read smaller than 32-bits,"
7769 " offset=0x%"PRIx64"", addr);
7770 /* should RAZ, fall through for now */
7771 }
7772
7773 if (addr > sizeof(n->bar) - size) {
7774 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7775 "MMIO read beyond last register,"
7776 " offset=0x%"PRIx64", returning 0", addr);
7777
7778 return 0;
7779 }
7780
7781 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7782 addr != NVME_REG_CSTS) {
7783 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7784 return 0;
7785 }
7786
7787     /*
7788      * When PMRWBM bit 1 is set, a read from PMRSTS
7789      * should ensure that prior writes have
7790      * made it to persistent media.
7791      */
7792 if (addr == NVME_REG_PMRSTS &&
7793 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7794 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7795 }
7796
7797 return ldn_le_p(ptr + addr, size);
7798 }
7799
7800 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7801 {
7802 PCIDevice *pci = PCI_DEVICE(n);
7803 uint32_t qid;
7804
7805 if (unlikely(addr & ((1 << 2) - 1))) {
7806 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7807 "doorbell write not 32-bit aligned,"
7808 " offset=0x%"PRIx64", ignoring", addr);
7809 return;
7810 }
7811
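    /*
     * Doorbell registers start at offset 1000h and, with the 4-byte doorbell
     * stride used by this device, are laid out as SQyTDBL at 1000h + (2y * 4)
     * and CQyHDBL at 1000h + ((2y + 1) * 4). Bit 2 of (addr - 0x1000) thus
     * selects between submission queue tail and completion queue head
     * writes, and shifting the remaining offset right by 3 recovers the
     * queue id; e.g. a write to 100Ch is the head doorbell of CQ 1.
     */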
7812 if (((addr - 0x1000) >> 2) & 1) {
7813 /* Completion queue doorbell write */
7814
7815 uint16_t new_head = val & 0xffff;
7816 NvmeCQueue *cq;
7817
7818 qid = (addr - (0x1000 + (1 << 2))) >> 3;
7819 if (unlikely(nvme_check_cqid(n, qid))) {
7820 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
7821 "completion queue doorbell write"
7822 " for nonexistent queue,"
7823                            " cqid=%"PRIu32", ignoring", qid);
7824
7825 /*
7826 * NVM Express v1.3d, Section 4.1 state: "If host software writes
7827 * an invalid value to the Submission Queue Tail Doorbell or
7828 * Completion Queue Head Doorbell register and an Asynchronous Event
7829 * Request command is outstanding, then an asynchronous event is
7830 * posted to the Admin Completion Queue with a status code of
7831 * Invalid Doorbell Write Value."
7832 *
7833 * Also note that the spec includes the "Invalid Doorbell Register"
7834 * status code, but nowhere does it specify when to use it.
7835 * However, it seems reasonable to use it here in a similar
7836 * fashion.
7837 */
7838 if (n->outstanding_aers) {
7839 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7840 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7841 NVME_LOG_ERROR_INFO);
7842 }
7843
7844 return;
7845 }
7846
7847 cq = n->cq[qid];
7848 if (unlikely(new_head >= cq->size)) {
7849 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
7850 "completion queue doorbell write value"
7851                            " beyond queue size, cqid=%"PRIu32","
7852 " new_head=%"PRIu16", ignoring",
7853 qid, new_head);
7854
7855 if (n->outstanding_aers) {
7856 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7857 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7858 NVME_LOG_ERROR_INFO);
7859 }
7860
7861 return;
7862 }
7863
7864 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
7865
7866         /* schedule deferred cqe posting if the queue was previously full */
7867 if (nvme_cq_full(cq)) {
7868 qemu_bh_schedule(cq->bh);
7869 }
7870
7871 cq->head = new_head;
7872 if (!qid && n->dbbuf_enabled) {
7873 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7874 }
7875
7876 if (cq->tail == cq->head) {
7877 if (cq->irq_enabled) {
7878 n->cq_pending--;
7879 }
7880
7881 nvme_irq_deassert(n, cq);
7882 }
7883 } else {
7884 /* Submission queue doorbell write */
7885
7886 uint16_t new_tail = val & 0xffff;
7887 NvmeSQueue *sq;
7888
7889 qid = (addr - 0x1000) >> 3;
7890 if (unlikely(nvme_check_sqid(n, qid))) {
7891 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
7892 "submission queue doorbell write"
7893 " for nonexistent queue,"
7894 " sqid=%"PRIu32", ignoring", qid);
7895
7896 if (n->outstanding_aers) {
7897 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7898 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7899 NVME_LOG_ERROR_INFO);
7900 }
7901
7902 return;
7903 }
7904
7905 sq = n->sq[qid];
7906 if (unlikely(new_tail >= sq->size)) {
7907 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
7908 "submission queue doorbell write value"
7909 " beyond queue size, sqid=%"PRIu32","
7910 " new_tail=%"PRIu16", ignoring",
7911 qid, new_tail);
7912
7913 if (n->outstanding_aers) {
7914 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7915 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7916 NVME_LOG_ERROR_INFO);
7917 }
7918
7919 return;
7920 }
7921
7922 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
7923
7924 sq->tail = new_tail;
7925 if (!qid && n->dbbuf_enabled) {
7926 /*
7927 * The spec states "the host shall also update the controller's
7928 * corresponding doorbell property to match the value of that entry
7929 * in the Shadow Doorbell buffer."
7930 *
7931 * Since this context is currently a VM trap, we can safely enforce
7932 * the requirement from the device side in case the host is
7933 * misbehaving.
7934 *
7935              * Note, we shouldn't have to do this, but various drivers,
7936              * including ones that run on Linux, do not update the Admin Queue's
7937              * shadow entry, so we can't trust it for an appropriate sq tail.
7938 */
7939 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7940 }
7941
7942 qemu_bh_schedule(sq->bh);
7943 }
7944 }
7945
7946 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
7947 unsigned size)
7948 {
7949 NvmeCtrl *n = (NvmeCtrl *)opaque;
7950
7951 trace_pci_nvme_mmio_write(addr, data, size);
7952
7953 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7954 addr != NVME_REG_CSTS) {
7955 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7956 return;
7957 }
7958
7959 if (addr < sizeof(n->bar)) {
7960 nvme_write_bar(n, addr, data, size);
7961 } else {
7962 nvme_process_db(n, addr, data);
7963 }
7964 }
7965
7966 static const MemoryRegionOps nvme_mmio_ops = {
7967 .read = nvme_mmio_read,
7968 .write = nvme_mmio_write,
7969 .endianness = DEVICE_LITTLE_ENDIAN,
7970 .impl = {
7971 .min_access_size = 2,
7972 .max_access_size = 8,
7973 },
7974 };
7975
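/*
 * The CMB is backed by a plain host buffer; accesses to the CMB BAR simply
 * load and store little-endian values at the given offset.
 */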
7976 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
7977 unsigned size)
7978 {
7979 NvmeCtrl *n = (NvmeCtrl *)opaque;
7980 stn_le_p(&n->cmb.buf[addr], size, data);
7981 }
7982
7983 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
7984 {
7985 NvmeCtrl *n = (NvmeCtrl *)opaque;
7986 return ldn_le_p(&n->cmb.buf[addr], size);
7987 }
7988
7989 static const MemoryRegionOps nvme_cmb_ops = {
7990 .read = nvme_cmb_read,
7991 .write = nvme_cmb_write,
7992 .endianness = DEVICE_LITTLE_ENDIAN,
7993 .impl = {
7994 .min_access_size = 1,
7995 .max_access_size = 8,
7996 },
7997 };
7998
7999 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8000 {
8001 NvmeParams *params = &n->params;
8002
8003 if (params->num_queues) {
8004 warn_report("num_queues is deprecated; please use max_ioqpairs "
8005 "instead");
8006
8007 params->max_ioqpairs = params->num_queues - 1;
8008 }
8009
8010 if (n->namespace.blkconf.blk && n->subsys) {
8011 error_setg(errp, "subsystem support is unavailable with legacy "
8012 "namespace ('drive' property)");
8013 return false;
8014 }
8015
8016 if (params->max_ioqpairs < 1 ||
8017 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8018 error_setg(errp, "max_ioqpairs must be between 1 and %d",
8019 NVME_MAX_IOQPAIRS);
8020 return false;
8021 }
8022
8023 if (params->msix_qsize < 1 ||
8024 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8025 error_setg(errp, "msix_qsize must be between 1 and %d",
8026 PCI_MSIX_FLAGS_QSIZE + 1);
8027 return false;
8028 }
8029
8030 if (!params->serial) {
8031 error_setg(errp, "serial property not set");
8032 return false;
8033 }
8034
8035 if (params->mqes < 1) {
8036 error_setg(errp, "mqes property cannot be less than 1");
8037 return false;
8038 }
8039
8040 if (n->pmr.dev) {
8041 if (params->msix_exclusive_bar) {
8042 error_setg(errp, "not enough BARs available to enable PMR");
8043 return false;
8044 }
8045
8046 if (host_memory_backend_is_mapped(n->pmr.dev)) {
8047 error_setg(errp, "can't use already busy memdev: %s",
8048 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8049 return false;
8050 }
8051
8052 if (!is_power_of_2(n->pmr.dev->size)) {
8053             error_setg(errp, "pmr backend size needs to be a power of 2");
8054 return false;
8055 }
8056
8057 host_memory_backend_set_mapped(n->pmr.dev, true);
8058 }
8059
8060 if (n->params.zasl > n->params.mdts) {
8061 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8062 "than or equal to mdts (Maximum Data Transfer Size)");
8063 return false;
8064 }
8065
8066 if (!n->params.vsl) {
8067 error_setg(errp, "vsl must be non-zero");
8068 return false;
8069 }
8070
8071 if (params->sriov_max_vfs) {
8072 if (!n->subsys) {
8073 error_setg(errp, "subsystem is required for the use of SR-IOV");
8074 return false;
8075 }
8076
8077 if (params->cmb_size_mb) {
8078 error_setg(errp, "CMB is not supported with SR-IOV");
8079 return false;
8080 }
8081
8082 if (n->pmr.dev) {
8083 error_setg(errp, "PMR is not supported with SR-IOV");
8084 return false;
8085 }
8086
8087 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8088 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8089 " must be set for the use of SR-IOV");
8090 return false;
8091 }
8092
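        /*
         * The checks below tie the flexible resource pools to the number of
         * VFs; e.g. sriov_max_vfs=2 requires at least sriov_vq_flexible=4
         * and sriov_vi_flexible=2, which in turn require max_ioqpairs >= 6
         * and msix_qsize >= 3.
         */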
8093 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8094 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8095 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8096 return false;
8097 }
8098
8099 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8100 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8101 " greater than or equal to 2");
8102 return false;
8103 }
8104
8105 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8106 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8107 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8108 return false;
8109 }
8110
8111 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8112 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8113 " greater than or equal to 1");
8114 return false;
8115 }
8116
8117 if (params->sriov_max_vi_per_vf &&
8118 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8119 error_setg(errp, "sriov_max_vi_per_vf must meet:"
8120 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8121 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8122 return false;
8123 }
8124
8125 if (params->sriov_max_vq_per_vf &&
8126 (params->sriov_max_vq_per_vf < 2 ||
8127 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8128 error_setg(errp, "sriov_max_vq_per_vf must meet:"
8129 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8130 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8131 return false;
8132 }
8133 }
8134
8135 return true;
8136 }
8137
8138 static void nvme_init_state(NvmeCtrl *n)
8139 {
8140 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8141 NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8142 NvmeSecCtrlEntry *sctrl;
8143 PCIDevice *pci = PCI_DEVICE(n);
8144 uint8_t max_vfs;
8145 int i;
8146
8147 if (pci_is_vf(pci)) {
8148 sctrl = nvme_sctrl(n);
8149 max_vfs = 0;
8150 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8151 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8152 } else {
8153 max_vfs = n->params.sriov_max_vfs;
8154 n->conf_ioqpairs = n->params.max_ioqpairs;
8155 n->conf_msix_qsize = n->params.msix_qsize;
8156 }
8157
8158 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8159 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8160 n->temperature = NVME_TEMPERATURE;
8161 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8162 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8163 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8164 QTAILQ_INIT(&n->aer_queue);
8165
8166 n->nr_sec_ctrls = max_vfs;
8167 for (i = 0; i < max_vfs; i++) {
8168 sctrl = &list[i];
8169 sctrl->pcid = cpu_to_le16(n->cntlid);
8170 sctrl->vfn = cpu_to_le16(i + 1);
8171 }
8172
8173 cap->cntlid = cpu_to_le16(n->cntlid);
8174 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8175
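    /*
     * Primary Controller Capabilities: the private totals (VQPRT/VIPRT)
     * cover the resources the primary controller keeps for itself, the
     * flexible totals (VQFRT/VIFRT) form the pool assignable to secondary
     * controllers, and VQFRSM/VIFRSM cap how much of that pool a single
     * secondary controller may be assigned.
     */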
8176 if (pci_is_vf(pci)) {
8177 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8178 } else {
8179 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8180 n->params.sriov_vq_flexible);
8181 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8182 cap->vqrfap = cap->vqfrt;
8183 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8184 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8185 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8186 cap->vqfrt / MAX(max_vfs, 1);
8187 }
8188
8189 if (pci_is_vf(pci)) {
8190 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8191 } else {
8192 cap->viprt = cpu_to_le16(n->params.msix_qsize -
8193 n->params.sriov_vi_flexible);
8194 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8195 cap->virfap = cap->vifrt;
8196 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8197 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8198 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8199 cap->vifrt / MAX(max_vfs, 1);
8200 }
8201 }
8202
8203 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8204 {
8205 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8206 uint64_t cap = ldq_le_p(&n->bar.cap);
8207
8208 n->cmb.buf = g_malloc0(cmb_size);
8209 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8210 "nvme-cmb", cmb_size);
8211 pci_register_bar(pci_dev, NVME_CMB_BIR,
8212 PCI_BASE_ADDRESS_SPACE_MEMORY |
8213 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8214 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8215
8216 NVME_CAP_SET_CMBS(cap, 1);
8217 stq_le_p(&n->bar.cap, cap);
8218
8219 if (n->params.legacy_cmb) {
8220 nvme_cmb_enable_regs(n);
8221 n->cmb.cmse = true;
8222 }
8223 }
8224
8225 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8226 {
8227 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8228
8229 NVME_PMRCAP_SET_RDS(pmrcap, 1);
8230 NVME_PMRCAP_SET_WDS(pmrcap, 1);
8231 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8232     /* Turn on PMRWBM bit 1 (reads from PMRSTS act as a write barrier) */
8233 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8234 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8235 stl_le_p(&n->bar.pmrcap, pmrcap);
8236
8237 pci_register_bar(pci_dev, NVME_PMR_BIR,
8238 PCI_BASE_ADDRESS_SPACE_MEMORY |
8239 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8240 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8241
8242 memory_region_set_enabled(&n->pmr.dev->mr, false);
8243 }
8244
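/*
 * Compute the size of the memory BAR: the NVMe register block followed by a
 * submission and a completion doorbell per queue, then (when interrupt
 * vectors are used) the MSI-X table and PBA, each placed on a 4 KiB
 * boundary, with the result rounded up to a power of two. For example,
 * assuming a 4 KiB register block, 65 queues and 65 vectors give
 * 0x1000 + 65 * 8 = 0x1208 bytes of registers and doorbells, an MSI-X table
 * at 0x2000, a PBA at 0x3000 and a final BAR size of 16 KiB.
 */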
8245 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8246 unsigned *msix_table_offset,
8247 unsigned *msix_pba_offset)
8248 {
8249 uint64_t bar_size, msix_table_size;
8250
8251 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8252
8253 if (total_irqs == 0) {
8254 goto out;
8255 }
8256
8257 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8258
8259 if (msix_table_offset) {
8260 *msix_table_offset = bar_size;
8261 }
8262
8263 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8264 bar_size += msix_table_size;
8265 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8266
8267 if (msix_pba_offset) {
8268 *msix_pba_offset = bar_size;
8269 }
8270
8271 bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8272
8273 out:
8274 return pow2ceil(bar_size);
8275 }
8276
8277 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
8278 {
8279 uint16_t vf_dev_id = n->params.use_intel_id ?
8280 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8281 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8282 uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8283 le16_to_cpu(cap->vifrsm),
8284 NULL, NULL);
8285
8286 pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8287 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8288 NVME_VF_OFFSET, NVME_VF_STRIDE);
8289
8290 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8291 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8292 }
8293
8294 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8295 {
8296 Error *err = NULL;
8297 int ret;
8298
8299 ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8300 PCI_PM_SIZEOF, &err);
8301 if (err) {
8302 error_report_err(err);
8303 return ret;
8304 }
8305
8306 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8307 PCI_PM_CAP_VER_1_2);
8308 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8309 PCI_PM_CTRL_NO_SOFT_RESET);
8310 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8311 PCI_PM_CTRL_STATE_MASK);
8312
8313 return 0;
8314 }
8315
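/*
 * Forward a DOE mailbox object to the external SPDM responder: the request
 * is taken from the DOE write mailbox, sent over the SPDM socket, and the
 * response is placed in the read mailbox (mailbox lengths are in dwords).
 */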
8316 static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8317 {
8318 void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8319 uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8320 void *rsp = doe_cap->read_mbox;
8321 uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8322
8323 uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8324 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8325 req, req_len, rsp, rsp_len);
8326 doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8327
8328 return recvd != 0;
8329 }
8330
8331 static DOEProtocol doe_spdm_prot[] = {
8332 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8333 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8334 { }
8335 };
8336
8337 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8338 {
8339 ERRP_GUARD();
8340 uint8_t *pci_conf = pci_dev->config;
8341 uint64_t bar_size;
8342 unsigned msix_table_offset = 0, msix_pba_offset = 0;
8343 unsigned nr_vectors;
8344 int ret;
8345
8346 pci_conf[PCI_INTERRUPT_PIN] = 1;
8347 pci_config_set_prog_interface(pci_conf, 0x2);
8348
8349 if (n->params.use_intel_id) {
8350 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8351 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8352 } else {
8353 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8354 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8355 }
8356
8357 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8358 nvme_add_pm_capability(pci_dev, 0x60);
8359 pcie_endpoint_cap_init(pci_dev, 0x80);
8360 pcie_cap_flr_init(pci_dev);
8361 if (n->params.sriov_max_vfs) {
8362 pcie_ari_init(pci_dev, 0x100);
8363 }
8364
8365 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8366 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8367 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8368 bar_size);
8369 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8370 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8371 ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8372 } else {
8373 assert(n->params.msix_qsize >= 1);
8374
8375 /* add one to max_ioqpairs to account for the admin queue pair */
8376 if (!pci_is_vf(pci_dev)) {
8377 nr_vectors = n->params.msix_qsize;
8378 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8379 nr_vectors, &msix_table_offset,
8380 &msix_pba_offset);
8381 } else {
8382 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8383 NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8384
8385 nr_vectors = le16_to_cpu(cap->vifrsm);
8386 bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8387 &msix_table_offset, &msix_pba_offset);
8388 }
8389
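        /*
         * BAR0 is a container region: the register/doorbell MMIO region
         * covers the space up to the MSI-X table offset computed above, and
         * msix_init() places the MSI-X table and PBA behind it in the same
         * BAR.
         */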
8390 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8391 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8392 msix_table_offset);
8393 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8394
8395 if (pci_is_vf(pci_dev)) {
8396 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8397 } else {
8398 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8399 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8400 }
8401
8402 ret = msix_init(pci_dev, nr_vectors,
8403 &n->bar0, 0, msix_table_offset,
8404 &n->bar0, 0, msix_pba_offset, 0, errp);
8405 }
8406
8407 if (ret == -ENOTSUP) {
8408 /* report that msix is not supported, but do not error out */
8409 warn_report_err(*errp);
8410 *errp = NULL;
8411 } else if (ret < 0) {
8412 /* propagate error to caller */
8413 return false;
8414 }
8415
8416 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8417
8418 pcie_cap_deverr_init(pci_dev);
8419
8420 /* DOE Initialisation */
8421 if (pci_dev->spdm_port) {
8422 uint16_t doe_offset = n->params.sriov_max_vfs ?
8423 PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8424 : PCI_CONFIG_SPACE_SIZE;
8425
8426 pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8427 doe_spdm_prot, true, 0);
8428
8429 pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8430 errp);
8431
8432 if (pci_dev->doe_spdm.spdm_socket < 0) {
8433 return false;
8434 }
8435 }
8436
8437 if (n->params.cmb_size_mb) {
8438 nvme_init_cmb(n, pci_dev);
8439 }
8440
8441 if (n->pmr.dev) {
8442 nvme_init_pmr(n, pci_dev);
8443 }
8444
8445 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8446 nvme_init_sriov(n, pci_dev, 0x120);
8447 }
8448
8449 return true;
8450 }
8451
8452 static void nvme_init_subnqn(NvmeCtrl *n)
8453 {
8454 NvmeSubsystem *subsys = n->subsys;
8455 NvmeIdCtrl *id = &n->id_ctrl;
8456
8457 if (!subsys) {
8458 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8459 "nqn.2019-08.org.qemu:%s", n->params.serial);
8460 } else {
8461 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8462 }
8463 }
8464
8465 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8466 {
8467 NvmeIdCtrl *id = &n->id_ctrl;
8468 uint8_t *pci_conf = pci_dev->config;
8469 uint64_t cap = ldq_le_p(&n->bar.cap);
8470 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8471 uint32_t ctratt;
8472
8473 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8474 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8475 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8476 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8477 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8478
8479 id->cntlid = cpu_to_le16(n->cntlid);
8480
8481 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8482 ctratt = NVME_CTRATT_ELBAS;
8483
8484 id->rab = 6;
8485
8486 if (n->params.use_intel_id) {
8487 id->ieee[0] = 0xb3;
8488 id->ieee[1] = 0x02;
8489 id->ieee[2] = 0x00;
8490 } else {
8491 id->ieee[0] = 0x00;
8492 id->ieee[1] = 0x54;
8493 id->ieee[2] = 0x52;
8494 }
8495
8496 id->mdts = n->params.mdts;
8497 id->ver = cpu_to_le32(NVME_SPEC_VER);
8498 id->oacs =
8499 cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8500 NVME_OACS_DIRECTIVES);
8501 id->cntrltype = 0x1;
8502
8503 /*
8504 * Because the controller always completes the Abort command immediately,
8505 * there can never be more than one concurrently executing Abort command,
8506 * so this value is never used for anything. Note that there can easily be
8507 * many Abort commands in the queues, but they are not considered
8508 * "executing" until processed by nvme_abort.
8509 *
8510 * The specification recommends a value of 3 for Abort Command Limit (four
8511      * concurrently outstanding Abort commands), so let's use that, though it is
8512 * inconsequential.
8513 */
8514 id->acl = 3;
8515 id->aerl = n->params.aerl;
8516 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8517 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8518
8519 /* recommended default value (~70 C) */
8520 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8521 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8522
8523 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8524 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8525 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8526 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8527 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8528 NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8529 NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8530
8531 /*
8532 * NOTE: If this device ever supports a command set that does NOT use 0x0
8533 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8534 * should probably be removed.
8535 *
8536 * See comment in nvme_io_cmd.
8537 */
8538 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8539
8540 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8541 NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8542 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
8543
8544 nvme_init_subnqn(n);
8545
8546 id->psd[0].mp = cpu_to_le16(0x9c4);
8547 id->psd[0].enlat = cpu_to_le32(0x10);
8548 id->psd[0].exlat = cpu_to_le32(0x4);
8549
8550 if (n->subsys) {
8551 id->cmic |= NVME_CMIC_MULTI_CTRL;
8552 ctratt |= NVME_CTRATT_ENDGRPS;
8553
8554 id->endgidmax = cpu_to_le16(0x1);
8555
8556 if (n->subsys->endgrp.fdp.enabled) {
8557 ctratt |= NVME_CTRATT_FDPS;
8558 }
8559 }
8560
8561 id->ctratt = cpu_to_le32(ctratt);
8562
8563 NVME_CAP_SET_MQES(cap, n->params.mqes);
8564 NVME_CAP_SET_CQR(cap, 1);
8565 NVME_CAP_SET_TO(cap, 0xf);
8566 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8567 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8568 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8569 NVME_CAP_SET_MPSMAX(cap, 4);
8570 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8571 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8572 stq_le_p(&n->bar.cap, cap);
8573
8574 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8575 n->bar.intmc = n->bar.intms = 0;
8576
8577 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8578 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8579 }
8580 }
8581
8582 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8583 {
8584 int cntlid;
8585
8586 if (!n->subsys) {
8587 return 0;
8588 }
8589
8590 cntlid = nvme_subsys_register_ctrl(n, errp);
8591 if (cntlid < 0) {
8592 return -1;
8593 }
8594
8595 n->cntlid = cntlid;
8596
8597 return 0;
8598 }
8599
8600 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8601 {
8602 uint32_t nsid = ns->params.nsid;
8603 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8604
8605 n->namespaces[nsid] = ns;
8606 ns->attached++;
8607
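    /*
     * Cap DMRSL (Dataset Management Range Size Limit) so that a single
     * range, expressed in this namespace's logical blocks, never exceeds
     * the block layer's maximum request size.
     */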
8608 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8609 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8610 }
8611
8612 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8613 {
8614 NvmeCtrl *n = NVME(pci_dev);
8615 DeviceState *dev = DEVICE(pci_dev);
8616 NvmeNamespace *ns;
8617 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8618
8619 if (pci_is_vf(pci_dev)) {
8620 /*
8621          * VFs derive settings from the parent. The PF's lifespan exceeds
8622          * that of the VFs.
8623 */
8624 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8625
8626 /*
8627          * Duplicate the PF's serial string so that removing a VF does not
8628          * release the PF's 'serial' property object.
8629 */
8630 n->params.serial = g_strdup(pn->params.serial);
8631 n->subsys = pn->subsys;
8632 }
8633
8634 if (!nvme_check_params(n, errp)) {
8635 return;
8636 }
8637
8638 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8639
8640 if (nvme_init_subsys(n, errp)) {
8641 return;
8642 }
8643 nvme_init_state(n);
8644 if (!nvme_init_pci(n, pci_dev, errp)) {
8645 return;
8646 }
8647 nvme_init_ctrl(n, pci_dev);
8648
8649 /* setup a namespace if the controller drive property was given */
8650 if (n->namespace.blkconf.blk) {
8651 ns = &n->namespace;
8652 ns->params.nsid = 1;
8653
8654 if (nvme_ns_setup(ns, errp)) {
8655 return;
8656 }
8657
8658 nvme_attach_ns(n, ns);
8659 }
8660 }
8661
8662 static void nvme_exit(PCIDevice *pci_dev)
8663 {
8664 NvmeCtrl *n = NVME(pci_dev);
8665 NvmeNamespace *ns;
8666 int i;
8667
8668 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8669
8670 if (n->subsys) {
8671 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8672 ns = nvme_ns(n, i);
8673 if (ns) {
8674 ns->attached--;
8675 }
8676 }
8677
8678 nvme_subsys_unregister_ctrl(n->subsys, n);
8679 }
8680
8681 g_free(n->cq);
8682 g_free(n->sq);
8683 g_free(n->aer_reqs);
8684
8685 if (n->params.cmb_size_mb) {
8686 g_free(n->cmb.buf);
8687 }
8688
8689 if (pci_dev->doe_spdm.spdm_socket > 0) {
8690 spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
8691 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
8692 }
8693
8694 if (n->pmr.dev) {
8695 host_memory_backend_set_mapped(n->pmr.dev, false);
8696 }
8697
8698 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8699 pcie_sriov_pf_exit(pci_dev);
8700 }
8701
8702 msix_uninit(pci_dev, &n->bar0, &n->bar0);
8703 memory_region_del_subregion(&n->bar0, &n->iomem);
8704 }
8705
8706 static Property nvme_props[] = {
8707 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8708 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8709 HostMemoryBackend *),
8710 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8711 NvmeSubsystem *),
8712 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8713 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8714 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8715 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8716 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8717 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8718 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8719 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8720 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8721 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8722 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8723 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8724 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8725 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8726 params.auto_transition_zones, true),
8727 DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8728 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8729 params.sriov_vq_flexible, 0),
8730 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8731 params.sriov_vi_flexible, 0),
8732 DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
8733 params.sriov_max_vi_per_vf, 0),
8734 DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
8735 params.sriov_max_vq_per_vf, 0),
8736 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
8737 false),
8738 DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
8739 DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
8740 DEFINE_PROP_END_OF_LIST(),
8741 };
8742
8743 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8744 void *opaque, Error **errp)
8745 {
8746 NvmeCtrl *n = NVME(obj);
8747 uint8_t value = n->smart_critical_warning;
8748
8749 visit_type_uint8(v, name, &value, errp);
8750 }
8751
8752 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8753 void *opaque, Error **errp)
8754 {
8755 NvmeCtrl *n = NVME(obj);
8756 uint8_t value, old_value, cap = 0, index, event;
8757
8758 if (!visit_type_uint8(v, name, &value, errp)) {
8759 return;
8760 }
8761
8762 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8763 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8764 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8765 cap |= NVME_SMART_PMR_UNRELIABLE;
8766 }
8767
8768 if ((value & cap) != value) {
8769 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8770 value & ~cap);
8771 return;
8772 }
8773
8774 old_value = n->smart_critical_warning;
8775 n->smart_critical_warning = value;
8776
8777 /* only inject new bits of smart critical warning */
8778 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
8779 event = 1 << index;
8780         if (value & ~old_value & event) {
8781             nvme_smart_event(n, event);
        }
8782 }
8783 }
8784
8785 static void nvme_pci_reset(DeviceState *qdev)
8786 {
8787 PCIDevice *pci_dev = PCI_DEVICE(qdev);
8788 NvmeCtrl *n = NVME(pci_dev);
8789
8790 trace_pci_nvme_pci_reset();
8791 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8792 }
8793
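/*
 * If a config space write reduced the number of enabled VFs, take the
 * secondary controllers of the removed VFs offline.
 */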
8794 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
8795 {
8796 NvmeCtrl *n = NVME(dev);
8797 NvmeSecCtrlEntry *sctrl;
8798 int i;
8799
8800 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
8801 sctrl = &n->sec_ctrl_list[i];
8802 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
8803 }
8804 }
8805
8806 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
8807 uint32_t val, int len)
8808 {
8809 uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
8810
8811 if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
8812 pcie_doe_write_config(&dev->doe_spdm, address, val, len);
8813 }
8814 pci_default_write_config(dev, address, val, len);
8815 pcie_cap_flr_write_config(dev, address, val, len);
8816 nvme_sriov_post_write_config(dev, old_num_vfs);
8817 }
8818
8819 static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
8820 {
8821 uint32_t val;
8822 if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
8823 if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
8824 return val;
8825 }
8826 }
8827 return pci_default_read_config(dev, address, len);
8828 }
8829
8830 static const VMStateDescription nvme_vmstate = {
8831 .name = "nvme",
8832 .unmigratable = 1,
8833 };
8834
8835 static void nvme_class_init(ObjectClass *oc, void *data)
8836 {
8837 DeviceClass *dc = DEVICE_CLASS(oc);
8838 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
8839
8840 pc->realize = nvme_realize;
8841 pc->config_write = nvme_pci_write_config;
8842 pc->config_read = nvme_pci_read_config;
8843 pc->exit = nvme_exit;
8844 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
8845 pc->revision = 2;
8846
8847 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
8848 dc->desc = "Non-Volatile Memory Express";
8849 device_class_set_props(dc, nvme_props);
8850 dc->vmsd = &nvme_vmstate;
8851 dc->reset = nvme_pci_reset;
8852 }
8853
8854 static void nvme_instance_init(Object *obj)
8855 {
8856 NvmeCtrl *n = NVME(obj);
8857
8858 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
8859 "bootindex", "/namespace@1,0",
8860 DEVICE(obj));
8861
8862 object_property_add(obj, "smart_critical_warning", "uint8",
8863 nvme_get_smart_warning,
8864 nvme_set_smart_warning, NULL, NULL);
8865 }
8866
8867 static const TypeInfo nvme_info = {
8868 .name = TYPE_NVME,
8869 .parent = TYPE_PCI_DEVICE,
8870 .instance_size = sizeof(NvmeCtrl),
8871 .instance_init = nvme_instance_init,
8872 .class_init = nvme_class_init,
8873 .interfaces = (InterfaceInfo[]) {
8874 { INTERFACE_PCIE_DEVICE },
8875 { }
8876 },
8877 };
8878
8879 static const TypeInfo nvme_bus_info = {
8880 .name = TYPE_NVME_BUS,
8881 .parent = TYPE_BUS,
8882 .instance_size = sizeof(NvmeBus),
8883 };
8884
8885 static void nvme_register_types(void)
8886 {
8887 type_register_static(&nvme_info);
8888 type_register_static(&nvme_bus_info);
8889 }
8890
8891 type_init(nvme_register_types)
8892