/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 *   https://nvmexpress.org/developers/nvme-specification/
 *
 *
 * Notes on coding style
 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in the comments
 * (i.e. 'h' suffix instead of '0x' prefix).
 *
 * Usage
 * -----
 * See docs/system/devices/nvme.rst for extensive documentation.
 *
 * Add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
 *      -device nvme,serial=<serial>,id=<bus_name>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>, \
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
 *              mdts=<N[optional]>,vsl=<N[optional]>, \
 *              zoned.zasl=<N[optional]>, \
 *              zoned.auto_transition=<on|off[optional]>, \
 *              sriov_max_vfs=<N[optional]> \
 *              sriov_vq_flexible=<N[optional]> \
 *              sriov_vi_flexible=<N[optional]> \
 *              sriov_max_vi_per_vf=<N[optional]> \
 *              sriov_max_vq_per_vf=<N[optional]> \
 *              atomic.dn=<on|off[optional]>, \
 *              atomic.awun=<N[optional]>, \
 *              atomic.awupf=<N[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>, \
 *              zoned=<true|false[optional]>, \
 *              subsys=<subsys_id>,shared=<true|false[optional]>, \
 *              detached=<true|false[optional]>, \
 *              zoned.zone_size=<N[optional]>, \
 *              zoned.zone_capacity=<N[optional]>, \
 *              zoned.descr_ext_size=<N[optional]>, \
 *              zoned.max_active=<N[optional]>, \
 *              zoned.max_open=<N[optional]>, \
 *              zoned.cross_read=<true|false[optional]>
 *
 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
 * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
 * default, the device will use the "v1.4 CMB scheme" - use the `legacy-cmb`
 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
 *
 * Enabling PMR emulation can be achieved by pointing to a memory-backend-file.
 * For example:
 *      -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 *       size=<size> .... -device nvme,...,pmrdev=<mem_id>
 *
 * The PMR will use BAR 4/5 exclusively.
 *
 * To place controller(s) and namespace(s) in a subsystem, provide the
 * nvme-subsys device as above.
 *
 * nvme subsystem device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `nqn`
 *   This parameter provides the `<nqn_id>` part of the string
 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
 *   subsystem, but this is not enforced by QEMU. If not specified, it will
 *   default to the value of the `id` parameter (`<subsys_id>`).
 *
 * nvme device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`
 *   Specifying this parameter attaches the controller to the subsystem and
 *   the SUBNQN field in the controller will report the NQN of the subsystem
 *   device. This also enables the multi controller capability represented in
 *   the Identify Controller data structure in CMIC (Controller Multi-path I/O
 *   and Namespace Sharing Capabilities).
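 *   (e.g. with a hypothetical `-device nvme-subsys,id=subsys0,nqn=foo`, an
 *   attached controller reports a SUBNQN of `nqn.2019-08.org.qemu:foo`).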
 *
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
 *
 * - `mdts`
 *   Indicates the maximum data transfer size for a command that transfers data
 *   between host-accessible memory and the controller. The value is specified
 *   as a power of two (2^n) and is in units of the minimum memory page size
 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
 *   KiB).
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * - `zoned.auto_transition`
 *   Indicates whether zones in the implicitly opened state can be
 *   automatically transitioned to the closed state for resource management
 *   purposes. Defaults to 'on'.
 *
 * - `sriov_max_vfs`
 *   Indicates the maximum number of PCIe virtual functions supported
 *   by the controller. The default value is 0. Specifying a non-zero value
 *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
 *   Virtual function controllers will not report SR-IOV capability.
 *
 *   NOTE: Single Root I/O Virtualization support is experimental.
 *   All the related parameters may be subject to change.
 *
 * - `sriov_vq_flexible`
 *   Indicates the total number of flexible queue resources assignable to all
 *   the secondary controllers. Implicitly sets the number of the primary
 *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
 *
 * - `sriov_vi_flexible`
 *   Indicates the total number of flexible interrupt resources assignable to
 *   all the secondary controllers. Implicitly sets the number of the primary
 *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
 *
 * - `sriov_max_vi_per_vf`
 *   Indicates the maximum number of virtual interrupt resources assignable
 *   to a secondary controller. The default 0 resolves to
 *   `(sriov_vi_flexible / sriov_max_vfs)`.
 *
 * - `sriov_max_vq_per_vf`
 *   Indicates the maximum number of virtual queue resources assignable to
 *   a secondary controller. The default 0 resolves to
 *   `(sriov_vq_flexible / sriov_max_vfs)`.
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `shared`
 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 *   or implicitly by the most recently defined NvmeBus) is linked to an
 *   nvme-subsys device, the namespace will be attached to all controllers in
 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 *   private namespace and may only be attached to a single controller at a
 *   time.
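 *   (e.g. a hypothetical `-device nvme-ns,drive=nvm-1,bus=nvme0,shared=on`
 *   attaches the namespace to every controller in the subsystem that the
 *   `nvme0` controller is linked to).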
 *
 * - `detached`
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
 *   namespace will be available in the subsystem but not attached to any
 *   controllers.
 *
 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
 * In this case, the following namespace properties are available to configure
 * zoned operation:
 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 *
 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 *         The value 0 (default) forces zone capacity to be the same as zone
 *         size. The value of this property may not exceed zone size.
 *
 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
 *         This value needs to be specified in 64B units. If it is zero,
 *         namespace(s) will not support zone descriptor extensions.
 *
 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently active zones.
 *
 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently open zones.
 *
 *     zoned.cross_read=<enable RAZB, default: false>
 *         Setting this property to true enables Read Across Zone Boundaries.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "system/system.h"
#include "system/block-backend.h"
#include "system/hostmem.h"
#include "hw/pci/msix.h"
#include "hw/pci/pcie_sriov.h"
#include "system/spdm-socket.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "dif.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
#define NVME_VF_RES_GRANULARITY 1
#define NVME_VF_OFFSET 0x1
#define NVME_VF_STRIDE 1

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION] = true,
    [NVME_POWER_MANAGEMENT] = true,
    [NVME_TEMPERATURE_THRESHOLD] = true,
    [NVME_ERROR_RECOVERY] = true,
    [NVME_VOLATILE_WRITE_CACHE] = true,
    [NVME_NUMBER_OF_QUEUES] = true,
    [NVME_INTERRUPT_COALESCING] = true,
    [NVME_INTERRUPT_VECTOR_CONF] = true,
    [NVME_WRITE_ATOMICITY] = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
    [NVME_TIMESTAMP] = true,
    [NVME_HOST_BEHAVIOR_SUPPORT] = true,
    [NVME_COMMAND_SET_PROFILE] = true,
    [NVME_FDP_MODE] = true,
    [NVME_FDP_EVENTS] = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
    [NVME_WRITE_ATOMICITY] = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
    [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
    [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
    [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
};

static const uint32_t nvme_cse_acs_default[256] = {
    [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC |
                                   NVME_CMD_EFF_CCC,
    [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SECURITY_SEND] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SECURITY_RECV] = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_nvm_default[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_zoned_default[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,

    [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);
static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}

static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
                                     uint16_t ph)
{
    uint16_t rgif = ns->endgrp->fdp.rgif;

    if (!rgif) {
        return ph;
    }

    return (rg << (16 - rgif)) | ph;
}

static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
{
    return ph < ns->fdp.nphs;
}

static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
{
    return rg < endgrp->fdp.nrg;
}

static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
{
    uint16_t rgif = ns->endgrp->fdp.rgif;

    if (!rgif) {
        return pid;
    }

    return pid & ((1 << (15 - rgif)) - 1);
}

static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
{
    uint16_t rgif = ns->endgrp->fdp.rgif;

    if (!rgif) {
        return 0;
    }

    return pid >> (16 - rgif);
}

static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
                                  uint16_t *ph, uint16_t *rg)
{
    *rg = nvme_pid2rg(ns, pid);
    *ph = nvme_pid2ph(ns, pid);

    return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
}

static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
                                         uint32_t opn, uint32_t zrwa)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }

    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    if (zrwa > ns->zns.numzrwa) {
        return NVME_NOZRWA | NVME_DNR;
    }

    return NVME_SUCCESS;
}

/*
 * Check if we can open a zone without exceeding open/active limits.
 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 */
static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    return nvme_zns_check_resources(ns, act, opn, 0);
}

static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
{
    NvmeFdpEvent *ret = NULL;
    bool is_full = ebuf->next == ebuf->start && ebuf->nelems;

    ret = &ebuf->events[ebuf->next++];
    if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
        ebuf->next = 0;
    }
    if (is_full) {
        ebuf->start = ebuf->next;
    } else {
        ebuf->nelems++;
    }

    memset(ret, 0, sizeof(NvmeFdpEvent));
    ret->timestamp = nvme_get_timestamp(n);

    return ret;
}

static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
{
    return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
}

static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
{
    NvmeEnduranceGroup *endgrp = ns->endgrp;
    NvmeRuHandle *ruh;
    NvmeReclaimUnit *ru;
    NvmeFdpEvent *e = NULL;
    uint16_t ph, rg, ruhid;

    if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
        return false;
    }

    ruhid = ns->fdp.phs[ph];

    ruh = &endgrp->fdp.ruhs[ruhid];
    ru = &ruh->rus[rg];

    if (ru->ruamw) {
        if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
            e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
            e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
            e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
            e->pid = cpu_to_le16(pid);
            e->nsid = cpu_to_le32(ns->params.nsid);
            e->rgid = cpu_to_le16(rg);
            e->ruhid = cpu_to_le16(ruhid);
        }

        /* log (eventual) GC overhead of prematurely swapping the RU */
        nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
    }

    ru->ruamw = ruh->ruamw;

    return true;
}

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    /*
     * The purpose of this check is to guard against invalid "local" access to
     * the iomem (i.e. controller registers). Thus, we check against the range
     * covered by the 'bar0' MemoryRegion since that is currently composed of
     * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
     * that if the device model is ever changed to allow the CMB to be located
     * in BAR0 as well, then this must be changed.
     */
    lo = n->bar0.addr;
    hi = lo + int128_get64(n->bar0.size);

    return addr >= lo && addr < hi;
}

static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid &&
        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_irq_check(NvmeCtrl *n)
{
    PCIDevice *pci = PCI_DEVICE(n);
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(pci)) {
        return;
    }

    /* VFs do not implement INTx */
    if (pci_is_vf(pci)) {
        return;
    }

    if (~intms & n->irq_status) {
        pci_irq_assert(pci);
    } else {
        pci_irq_deassert(pci);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    PCIDevice *pci = PCI_DEVICE(n);

    if (cq->irq_enabled) {
        if (msix_enabled(pci)) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(pci, cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(PCI_DEVICE(n))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
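        /* host-memory transfers gather guest bus addresses in a QEMUSGList */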
        pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    uint32_t trans_len, count = ns->lbasz;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_iomem(n, addr)) {
        return NVME_DATA_TRAS_ERROR;
    }

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (sg->iov.niov + 1 > IOV_MAX) {
            goto max_mappings_exceeded;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    if (sg->qsg.nsg + 1 > IOV_MAX) {
        goto max_mappings_exceeded;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;

max_mappings_exceeded:
    NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
                   "number of mappings exceed 1024");
    return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}

static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
            uint32_t nents, prp_trans;
            int i = 0;

            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Hence, we need to calculate the number of entries
             * based on that offset.
             */
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

/*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped from *len.
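 * On return, *len holds the residual number of bytes not yet mapped.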
 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
            /*
             * All data has been mapped, but the SGL contains additional
             * segments and/or descriptors. The controller might accept
             * ignoring the rest of the SGL.
             */
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{
    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially huge SGL. The spec allows the SGL
     * to be larger (as in number of bytes required to describe the SGL
     * descriptors and segment chain) than the command transfer size, so it is
     * not bounded by MDTS.
     */
#define SEG_CHUNK_SIZE 256

    QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE];
    NvmeSglDescriptor *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

    /*
     * If the entire transfer can be described with a single data block it can
     * be mapped directly.
     */
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        seg_len = le32_to_cpu(sgld->len);

        /* check the length of the (Last) Segment descriptor */
        if (!seg_len || seg_len & 0xf) {
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        if (UINT64_MAX - addr < seg_len) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment, nsgld *
                             sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

        /*
         * If the segment ends with a Data Block, then we are done.
         */
        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;
        }

        /*
         * If the last descriptor was not a Data Block, then the current
         * segment must not be a Last Segment.
         */
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

        /*
         * Do not map the last descriptor; it will be a Segment or Last Segment
         * descriptor and is handled by the next iteration.
         */
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
    /* if there is any residual left in len, the SGL was too short */
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns) &&
        !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

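    /*
     * Copy `len` bytes in runs of `bytes`, skipping `skip_bytes` of
     * interleaved data after each run, starting `offset` bytes into the
     * scatter/gather mapping.
     */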
    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
        dma_addr_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
        } else {
            dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);

    if (nvme_ns_ext(ns) &&
        !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
                                   ns->lbaf.ms, 0, dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
                                   ns->lbasz, ns->lbasz, dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 uint32_t align, BlockCompletionFunc *cb,
                                 NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  uint32_t align, BlockCompletionFunc *cb,
                                  NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
{
    trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);

    stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
                   MEMTXATTRS_UNSPECIFIED);
}

static void nvme_update_cq_head(NvmeCQueue *cq)
{
    ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
                   MEMTXATTRS_UNSPECIFIED);

    trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
}

static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    bool pending = cq->head != cq->tail;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (n->dbbuf_enabled) {
            nvme_update_cq_eventidx(cq);
            nvme_update_cq_head(cq);
        }

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + (cq->tail << NVME_CQES);
        ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }

        QTAILQ_REMOVE(&cq->req_list, req, entry);

        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);

        if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
            qemu_bh_schedule(sq->bh);
        }

        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        if (cq->irq_enabled && !pending) {
            n->cq_pending++;
        }

        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          le32_to_cpu(req->cqe.result),
                                          le32_to_cpu(req->cqe.dw1),
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);

    qemu_bh_schedule(cq->bh);
}

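/*
 * Post completions for queued asynchronous events, as long as there are
 * outstanding AER commands and the event type is not currently masked.
 */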
static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

        /* can't post cqe if there is nothing to complete */
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

        /* ignore if masked (cqe posted, but event not cleared) */
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

    /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    NvmeAsyncEvent *event, *next;

    n->aer_mask &= ~(1 << event_type);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        if (event->result.event_type == event_type) {
            QTAILQ_REMOVE(&n->aer_queue, event, entry);
            n->aer_queued--;
            g_free(event);
        }
    }
}

static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
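    /* reject ranges that wrap around or extend beyond the namespace size */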
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb, int flags)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    int ret;

    /*
     * `pnum` holds the number of bytes after offset that share the same
     * allocation status as the byte at offset. If `pnum` is different from
     * `bytes`, we should check the allocation status of the next range and
     * continue this until all bytes have been checked.
     */
    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }


        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
                                    !!(ret & BDRV_BLOCK_ZERO));

        if (!(ret & flags)) {
            return 1;
        }

        offset += pnum;
    } while (pnum != bytes);

    return 0;
}

static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    int ret;
    Error *err = NULL;

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
    if (ret) {
        if (ret < 0) {
            error_setg_errno(&err, -ret, "unable to get block status");
            error_report_err(err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        return NVME_DULB;
    }

    return NVME_SUCCESS;
}

static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
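    /* use a shift when the zone size is a power of two; otherwise divide */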
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    if (zone_idx >= ns->num_zones) {
        return NULL;
    }

    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        g_assert_not_reached();
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (zone->d.za & NVME_ZA_ZRWA_VALID) {
        uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;

        if (slba < zone->w_ptr || slba + nlb > ezrwa) {
            trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
            return NVME_ZONE_INVALID_WRITE;
        }
    } else {
        if (unlikely(slba != zone->w_ptr)) {
            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                               zone->w_ptr);
            return NVME_ZONE_INVALID_WRITE;
        }
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        g_assert_not_reached();
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone;
    uint64_t bndry, end;
    uint16_t status;

    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    bndry = nvme_zone_rd_boundary(ns, zone);
    end = slba + nlb;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
            /*
             * Read across zone boundary - check that all subsequent
             * zones that are being read have an appropriate state.
             */
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);

        if (zone->d.za & NVME_ZA_ZRWA_VALID) {
            zone->d.za &= ~NVME_ZA_ZRWA_VALID;
            if (ns->params.numzrwa) {
                ns->zns.numzrwa++;
            }
        }

        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);

        if (zone->d.za & NVME_ZA_ZRWA_VALID) {
            if (ns->params.numzrwa) {
                ns->zns.numzrwa++;
            }
        }

        /* fallthrough */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
            /*
             * Automatically close this implicitly open zone.
             */
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}

enum {
    NVME_ZRM_AUTO = 1 << 0,
    NVME_ZRM_ZRWA = 1 << 1,
};

static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
                                    NvmeZone *zone, int flags)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fallthrough */

    case NVME_ZONE_STATE_CLOSED:
        if (n->params.auto_transition_zones) {
            nvme_zrm_auto_transition_zone(ns);
        }
        status = nvme_zns_check_resources(ns, act, 1,
1 : 0); 2018 if (status) { 2019 return status; 2020 } 2021 2022 if (act) { 2023 nvme_aor_inc_active(ns); 2024 } 2025 2026 nvme_aor_inc_open(ns); 2027 2028 if (flags & NVME_ZRM_AUTO) { 2029 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); 2030 return NVME_SUCCESS; 2031 } 2032 2033 /* fallthrough */ 2034 2035 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 2036 if (flags & NVME_ZRM_AUTO) { 2037 return NVME_SUCCESS; 2038 } 2039 2040 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); 2041 2042 /* fallthrough */ 2043 2044 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 2045 if (flags & NVME_ZRM_ZRWA) { 2046 ns->zns.numzrwa--; 2047 2048 zone->d.za |= NVME_ZA_ZRWA_VALID; 2049 } 2050 2051 return NVME_SUCCESS; 2052 2053 default: 2054 return NVME_ZONE_INVAL_TRANSITION; 2055 } 2056 } 2057 2058 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns, 2059 NvmeZone *zone) 2060 { 2061 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO); 2062 } 2063 2064 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, 2065 uint32_t nlb) 2066 { 2067 zone->d.wp += nlb; 2068 2069 if (zone->d.wp == nvme_zone_wr_boundary(zone)) { 2070 nvme_zrm_finish(ns, zone); 2071 } 2072 } 2073 2074 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone, 2075 uint32_t nlbc) 2076 { 2077 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg); 2078 2079 nlbc = nzrwafgs * ns->zns.zrwafg; 2080 2081 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc); 2082 2083 zone->w_ptr += nlbc; 2084 2085 nvme_advance_zone_wp(ns, zone, nlbc); 2086 } 2087 2088 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) 2089 { 2090 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2091 NvmeZone *zone; 2092 uint64_t slba; 2093 uint32_t nlb; 2094 2095 slba = le64_to_cpu(rw->slba); 2096 nlb = le16_to_cpu(rw->nlb) + 1; 2097 zone = nvme_get_zone_by_slba(ns, slba); 2098 assert(zone); 2099 2100 if (zone->d.za & NVME_ZA_ZRWA_VALID) { 2101 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1; 2102 uint64_t elba = slba + nlb - 1; 2103 2104 if (elba > ezrwa) { 2105 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa); 2106 } 2107 2108 return; 2109 } 2110 2111 nvme_advance_zone_wp(ns, zone, nlb); 2112 } 2113 2114 static inline bool nvme_is_write(NvmeRequest *req) 2115 { 2116 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2117 2118 return rw->opcode == NVME_CMD_WRITE || 2119 rw->opcode == NVME_CMD_ZONE_APPEND || 2120 rw->opcode == NVME_CMD_WRITE_ZEROES; 2121 } 2122 2123 static void nvme_misc_cb(void *opaque, int ret) 2124 { 2125 NvmeRequest *req = opaque; 2126 uint16_t cid = nvme_cid(req); 2127 2128 trace_pci_nvme_misc_cb(cid); 2129 2130 if (ret) { 2131 if (!req->status) { 2132 req->status = NVME_INTERNAL_DEV_ERROR; 2133 } 2134 2135 trace_pci_nvme_err_aio(cid, strerror(-ret), req->status); 2136 } 2137 2138 nvme_enqueue_req_completion(nvme_cq(req), req); 2139 } 2140 2141 void nvme_rw_complete_cb(void *opaque, int ret) 2142 { 2143 NvmeRequest *req = opaque; 2144 NvmeNamespace *ns = req->ns; 2145 BlockBackend *blk = ns->blkconf.blk; 2146 BlockAcctCookie *acct = &req->acct; 2147 BlockAcctStats *stats = blk_get_stats(blk); 2148 2149 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk)); 2150 2151 if (ret) { 2152 Error *err = NULL; 2153 2154 block_acct_failed(stats, acct); 2155 2156 switch (req->cmd.opcode) { 2157 case NVME_CMD_READ: 2158 req->status = NVME_UNRECOVERED_READ; 2159 break; 2160 2161 case NVME_CMD_WRITE: 2162 case NVME_CMD_WRITE_ZEROES: 2163 case NVME_CMD_ZONE_APPEND: 2164 req->status = 
NVME_WRITE_FAULT; 2165 break; 2166 2167 default: 2168 req->status = NVME_INTERNAL_DEV_ERROR; 2169 break; 2170 } 2171 2172 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); 2173 2174 error_setg_errno(&err, -ret, "aio failed"); 2175 error_report_err(err); 2176 } else { 2177 block_acct_done(stats, acct); 2178 } 2179 2180 if (ns->params.zoned && nvme_is_write(req)) { 2181 nvme_finalize_zoned_write(ns, req); 2182 } 2183 2184 nvme_enqueue_req_completion(nvme_cq(req), req); 2185 } 2186 2187 static void nvme_rw_cb(void *opaque, int ret) 2188 { 2189 NvmeRequest *req = opaque; 2190 NvmeNamespace *ns = req->ns; 2191 2192 BlockBackend *blk = ns->blkconf.blk; 2193 2194 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); 2195 2196 if (ret) { 2197 goto out; 2198 } 2199 2200 if (ns->lbaf.ms) { 2201 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2202 uint64_t slba = le64_to_cpu(rw->slba); 2203 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 2204 uint64_t offset = nvme_moff(ns, slba); 2205 2206 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) { 2207 size_t mlen = nvme_m2b(ns, nlb); 2208 2209 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen, 2210 BDRV_REQ_MAY_UNMAP, 2211 nvme_rw_complete_cb, req); 2212 return; 2213 } 2214 2215 if (nvme_ns_ext(ns) || req->cmd.mptr) { 2216 uint16_t status; 2217 2218 nvme_sg_unmap(&req->sg); 2219 status = nvme_map_mdata(nvme_ctrl(req), nlb, req); 2220 if (status) { 2221 ret = -EFAULT; 2222 goto out; 2223 } 2224 2225 if (req->cmd.opcode == NVME_CMD_READ) { 2226 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req); 2227 } 2228 2229 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req); 2230 } 2231 } 2232 2233 out: 2234 nvme_rw_complete_cb(req, ret); 2235 } 2236 2237 static void nvme_verify_cb(void *opaque, int ret) 2238 { 2239 NvmeBounceContext *ctx = opaque; 2240 NvmeRequest *req = ctx->req; 2241 NvmeNamespace *ns = req->ns; 2242 BlockBackend *blk = ns->blkconf.blk; 2243 BlockAcctCookie *acct = &req->acct; 2244 BlockAcctStats *stats = blk_get_stats(blk); 2245 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2246 uint64_t slba = le64_to_cpu(rw->slba); 2247 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 2248 uint16_t apptag = le16_to_cpu(rw->apptag); 2249 uint16_t appmask = le16_to_cpu(rw->appmask); 2250 uint64_t reftag = le32_to_cpu(rw->reftag); 2251 uint64_t cdw3 = le32_to_cpu(rw->cdw3); 2252 uint16_t status; 2253 2254 reftag |= cdw3 << 32; 2255 2256 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag); 2257 2258 if (ret) { 2259 block_acct_failed(stats, acct); 2260 req->status = NVME_UNRECOVERED_READ; 2261 2262 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); 2263 2264 goto out; 2265 } 2266 2267 block_acct_done(stats, acct); 2268 2269 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2270 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce, 2271 ctx->mdata.iov.size, slba); 2272 if (status) { 2273 req->status = status; 2274 goto out; 2275 } 2276 2277 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 2278 ctx->mdata.bounce, ctx->mdata.iov.size, 2279 prinfo, slba, apptag, appmask, &reftag); 2280 } 2281 2282 out: 2283 qemu_iovec_destroy(&ctx->data.iov); 2284 g_free(ctx->data.bounce); 2285 2286 qemu_iovec_destroy(&ctx->mdata.iov); 2287 g_free(ctx->mdata.bounce); 2288 2289 g_free(ctx); 2290 2291 nvme_enqueue_req_completion(nvme_cq(req), req); 2292 } 2293 2294 2295 static void nvme_verify_mdata_in_cb(void *opaque, int ret) 2296 { 2297 NvmeBounceContext *ctx = opaque; 2298 NvmeRequest *req = 
ctx->req; 2299 NvmeNamespace *ns = req->ns; 2300 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2301 uint64_t slba = le64_to_cpu(rw->slba); 2302 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2303 size_t mlen = nvme_m2b(ns, nlb); 2304 uint64_t offset = nvme_moff(ns, slba); 2305 BlockBackend *blk = ns->blkconf.blk; 2306 2307 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk)); 2308 2309 if (ret) { 2310 goto out; 2311 } 2312 2313 ctx->mdata.bounce = g_malloc(mlen); 2314 2315 qemu_iovec_reset(&ctx->mdata.iov); 2316 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2317 2318 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2319 nvme_verify_cb, ctx); 2320 return; 2321 2322 out: 2323 nvme_verify_cb(ctx, ret); 2324 } 2325 2326 struct nvme_compare_ctx { 2327 struct { 2328 QEMUIOVector iov; 2329 uint8_t *bounce; 2330 } data; 2331 2332 struct { 2333 QEMUIOVector iov; 2334 uint8_t *bounce; 2335 } mdata; 2336 }; 2337 2338 static void nvme_compare_mdata_cb(void *opaque, int ret) 2339 { 2340 NvmeRequest *req = opaque; 2341 NvmeNamespace *ns = req->ns; 2342 NvmeCtrl *n = nvme_ctrl(req); 2343 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2344 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 2345 uint16_t apptag = le16_to_cpu(rw->apptag); 2346 uint16_t appmask = le16_to_cpu(rw->appmask); 2347 uint64_t reftag = le32_to_cpu(rw->reftag); 2348 uint64_t cdw3 = le32_to_cpu(rw->cdw3); 2349 struct nvme_compare_ctx *ctx = req->opaque; 2350 g_autofree uint8_t *buf = NULL; 2351 BlockBackend *blk = ns->blkconf.blk; 2352 BlockAcctCookie *acct = &req->acct; 2353 BlockAcctStats *stats = blk_get_stats(blk); 2354 uint16_t status = NVME_SUCCESS; 2355 2356 reftag |= cdw3 << 32; 2357 2358 trace_pci_nvme_compare_mdata_cb(nvme_cid(req)); 2359 2360 if (ret) { 2361 block_acct_failed(stats, acct); 2362 req->status = NVME_UNRECOVERED_READ; 2363 2364 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); 2365 2366 goto out; 2367 } 2368 2369 buf = g_malloc(ctx->mdata.iov.size); 2370 2371 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size, 2372 NVME_TX_DIRECTION_TO_DEVICE, req); 2373 if (status) { 2374 req->status = status; 2375 goto out; 2376 } 2377 2378 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2379 uint64_t slba = le64_to_cpu(rw->slba); 2380 uint8_t *bufp; 2381 uint8_t *mbufp = ctx->mdata.bounce; 2382 uint8_t *end = mbufp + ctx->mdata.iov.size; 2383 int16_t pil = 0; 2384 2385 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 2386 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo, 2387 slba, apptag, appmask, &reftag); 2388 if (status) { 2389 req->status = status; 2390 goto out; 2391 } 2392 2393 /* 2394 * When formatted with protection information, do not compare the DIF 2395 * tuple. 
2396 */ 2397 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) { 2398 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns); 2399 } 2400 2401 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) { 2402 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) { 2403 req->status = NVME_CMP_FAILURE | NVME_DNR; 2404 goto out; 2405 } 2406 } 2407 2408 goto out; 2409 } 2410 2411 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) { 2412 req->status = NVME_CMP_FAILURE | NVME_DNR; 2413 goto out; 2414 } 2415 2416 block_acct_done(stats, acct); 2417 2418 out: 2419 qemu_iovec_destroy(&ctx->data.iov); 2420 g_free(ctx->data.bounce); 2421 2422 qemu_iovec_destroy(&ctx->mdata.iov); 2423 g_free(ctx->mdata.bounce); 2424 2425 g_free(ctx); 2426 2427 nvme_enqueue_req_completion(nvme_cq(req), req); 2428 } 2429 2430 static void nvme_compare_data_cb(void *opaque, int ret) 2431 { 2432 NvmeRequest *req = opaque; 2433 NvmeCtrl *n = nvme_ctrl(req); 2434 NvmeNamespace *ns = req->ns; 2435 BlockBackend *blk = ns->blkconf.blk; 2436 BlockAcctCookie *acct = &req->acct; 2437 BlockAcctStats *stats = blk_get_stats(blk); 2438 2439 struct nvme_compare_ctx *ctx = req->opaque; 2440 g_autofree uint8_t *buf = NULL; 2441 uint16_t status; 2442 2443 trace_pci_nvme_compare_data_cb(nvme_cid(req)); 2444 2445 if (ret) { 2446 block_acct_failed(stats, acct); 2447 req->status = NVME_UNRECOVERED_READ; 2448 2449 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); 2450 2451 goto out; 2452 } 2453 2454 buf = g_malloc(ctx->data.iov.size); 2455 2456 status = nvme_bounce_data(n, buf, ctx->data.iov.size, 2457 NVME_TX_DIRECTION_TO_DEVICE, req); 2458 if (status) { 2459 req->status = status; 2460 goto out; 2461 } 2462 2463 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) { 2464 req->status = NVME_CMP_FAILURE | NVME_DNR; 2465 goto out; 2466 } 2467 2468 if (ns->lbaf.ms) { 2469 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2470 uint64_t slba = le64_to_cpu(rw->slba); 2471 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2472 size_t mlen = nvme_m2b(ns, nlb); 2473 uint64_t offset = nvme_moff(ns, slba); 2474 2475 ctx->mdata.bounce = g_malloc(mlen); 2476 2477 qemu_iovec_init(&ctx->mdata.iov, 1); 2478 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2479 2480 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2481 nvme_compare_mdata_cb, req); 2482 return; 2483 } 2484 2485 block_acct_done(stats, acct); 2486 2487 out: 2488 qemu_iovec_destroy(&ctx->data.iov); 2489 g_free(ctx->data.bounce); 2490 g_free(ctx); 2491 2492 nvme_enqueue_req_completion(nvme_cq(req), req); 2493 } 2494 2495 typedef struct NvmeDSMAIOCB { 2496 BlockAIOCB common; 2497 BlockAIOCB *aiocb; 2498 NvmeRequest *req; 2499 int ret; 2500 2501 NvmeDsmRange *range; 2502 unsigned int nr; 2503 unsigned int idx; 2504 } NvmeDSMAIOCB; 2505 2506 static void nvme_dsm_cancel(BlockAIOCB *aiocb) 2507 { 2508 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common); 2509 2510 /* break nvme_dsm_cb loop */ 2511 iocb->idx = iocb->nr; 2512 iocb->ret = -ECANCELED; 2513 2514 if (iocb->aiocb) { 2515 blk_aio_cancel_async(iocb->aiocb); 2516 iocb->aiocb = NULL; 2517 } else { 2518 /* 2519 * We only reach this if nvme_dsm_cancel() has already been called or 2520 * the command ran to completion. 
2521 */ 2522 assert(iocb->idx == iocb->nr); 2523 } 2524 } 2525 2526 static const AIOCBInfo nvme_dsm_aiocb_info = { 2527 .aiocb_size = sizeof(NvmeDSMAIOCB), 2528 .cancel_async = nvme_dsm_cancel, 2529 }; 2530 2531 static void nvme_dsm_cb(void *opaque, int ret); 2532 2533 static void nvme_dsm_md_cb(void *opaque, int ret) 2534 { 2535 NvmeDSMAIOCB *iocb = opaque; 2536 NvmeRequest *req = iocb->req; 2537 NvmeNamespace *ns = req->ns; 2538 NvmeDsmRange *range; 2539 uint64_t slba; 2540 uint32_t nlb; 2541 2542 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) { 2543 goto done; 2544 } 2545 2546 range = &iocb->range[iocb->idx - 1]; 2547 slba = le64_to_cpu(range->slba); 2548 nlb = le32_to_cpu(range->nlb); 2549 2550 /* 2551 * Check that all block were discarded (zeroed); otherwise we do not zero 2552 * the metadata. 2553 */ 2554 2555 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO); 2556 if (ret) { 2557 if (ret < 0) { 2558 goto done; 2559 } 2560 2561 nvme_dsm_cb(iocb, 0); 2562 return; 2563 } 2564 2565 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba), 2566 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP, 2567 nvme_dsm_cb, iocb); 2568 return; 2569 2570 done: 2571 nvme_dsm_cb(iocb, ret); 2572 } 2573 2574 static void nvme_dsm_cb(void *opaque, int ret) 2575 { 2576 NvmeDSMAIOCB *iocb = opaque; 2577 NvmeRequest *req = iocb->req; 2578 NvmeCtrl *n = nvme_ctrl(req); 2579 NvmeNamespace *ns = req->ns; 2580 NvmeDsmRange *range; 2581 uint64_t slba; 2582 uint32_t nlb; 2583 2584 if (iocb->ret < 0) { 2585 goto done; 2586 } else if (ret < 0) { 2587 iocb->ret = ret; 2588 goto done; 2589 } 2590 2591 next: 2592 if (iocb->idx == iocb->nr) { 2593 goto done; 2594 } 2595 2596 range = &iocb->range[iocb->idx++]; 2597 slba = le64_to_cpu(range->slba); 2598 nlb = le32_to_cpu(range->nlb); 2599 2600 trace_pci_nvme_dsm_deallocate(slba, nlb); 2601 2602 if (nlb > n->dmrsl) { 2603 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); 2604 goto next; 2605 } 2606 2607 if (nvme_check_bounds(ns, slba, nlb)) { 2608 trace_pci_nvme_err_invalid_lba_range(slba, nlb, 2609 ns->id_ns.nsze); 2610 goto next; 2611 } 2612 2613 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba), 2614 nvme_l2b(ns, nlb), 2615 nvme_dsm_md_cb, iocb); 2616 return; 2617 2618 done: 2619 iocb->aiocb = NULL; 2620 iocb->common.cb(iocb->common.opaque, iocb->ret); 2621 g_free(iocb->range); 2622 qemu_aio_unref(iocb); 2623 } 2624 2625 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) 2626 { 2627 NvmeNamespace *ns = req->ns; 2628 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; 2629 uint32_t attr = le32_to_cpu(dsm->attributes); 2630 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; 2631 uint16_t status = NVME_SUCCESS; 2632 2633 trace_pci_nvme_dsm(nr, attr); 2634 2635 if (attr & NVME_DSMGMT_AD) { 2636 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk, 2637 nvme_misc_cb, req); 2638 2639 iocb->req = req; 2640 iocb->ret = 0; 2641 iocb->range = g_new(NvmeDsmRange, nr); 2642 iocb->nr = nr; 2643 iocb->idx = 0; 2644 2645 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr, 2646 req); 2647 if (status) { 2648 g_free(iocb->range); 2649 qemu_aio_unref(iocb); 2650 2651 return status; 2652 } 2653 2654 req->aiocb = &iocb->common; 2655 nvme_dsm_cb(iocb, 0); 2656 2657 return NVME_NO_COMPLETE; 2658 } 2659 2660 return status; 2661 } 2662 2663 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) 2664 { 2665 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2666 NvmeNamespace *ns = req->ns; 2667 BlockBackend *blk = 
ns->blkconf.blk; 2668 uint64_t slba = le64_to_cpu(rw->slba); 2669 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2670 size_t len = nvme_l2b(ns, nlb); 2671 size_t data_len = len; 2672 int64_t offset = nvme_l2b(ns, slba); 2673 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 2674 uint32_t reftag = le32_to_cpu(rw->reftag); 2675 NvmeBounceContext *ctx = NULL; 2676 uint16_t status; 2677 2678 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2679 2680 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2681 status = nvme_check_prinfo(ns, prinfo, slba, reftag); 2682 if (status) { 2683 return status; 2684 } 2685 2686 if (prinfo & NVME_PRINFO_PRACT) { 2687 return NVME_INVALID_PROT_INFO | NVME_DNR; 2688 } 2689 } 2690 2691 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) { 2692 data_len += nvme_m2b(ns, nlb); 2693 } 2694 2695 if (data_len > (n->page_size << n->params.vsl)) { 2696 return NVME_INVALID_FIELD | NVME_DNR; 2697 } 2698 2699 status = nvme_check_bounds(ns, slba, nlb); 2700 if (status) { 2701 return status; 2702 } 2703 2704 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2705 status = nvme_check_dulbe(ns, slba, nlb); 2706 if (status) { 2707 return status; 2708 } 2709 } 2710 2711 ctx = g_new0(NvmeBounceContext, 1); 2712 ctx->req = req; 2713 2714 ctx->data.bounce = g_malloc(len); 2715 2716 qemu_iovec_init(&ctx->data.iov, 1); 2717 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len); 2718 2719 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size, 2720 BLOCK_ACCT_READ); 2721 2722 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0, 2723 nvme_verify_mdata_in_cb, ctx); 2724 return NVME_NO_COMPLETE; 2725 } 2726 2727 typedef struct NvmeCopyAIOCB { 2728 BlockAIOCB common; 2729 BlockAIOCB *aiocb; 2730 NvmeRequest *req; 2731 NvmeCtrl *n; 2732 int ret; 2733 2734 void *ranges; 2735 unsigned int format; 2736 int nr; 2737 int idx; 2738 2739 uint8_t *bounce; 2740 QEMUIOVector iov; 2741 struct { 2742 BlockAcctCookie read; 2743 BlockAcctCookie write; 2744 } acct; 2745 2746 uint64_t reftag; 2747 uint64_t slba; 2748 2749 NvmeZone *zone; 2750 NvmeNamespace *sns; 2751 uint32_t tcl; 2752 } NvmeCopyAIOCB; 2753 2754 static void nvme_copy_cancel(BlockAIOCB *aiocb) 2755 { 2756 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common); 2757 2758 iocb->ret = -ECANCELED; 2759 2760 if (iocb->aiocb) { 2761 blk_aio_cancel_async(iocb->aiocb); 2762 iocb->aiocb = NULL; 2763 } 2764 } 2765 2766 static const AIOCBInfo nvme_copy_aiocb_info = { 2767 .aiocb_size = sizeof(NvmeCopyAIOCB), 2768 .cancel_async = nvme_copy_cancel, 2769 }; 2770 2771 static void nvme_copy_done(NvmeCopyAIOCB *iocb) 2772 { 2773 NvmeRequest *req = iocb->req; 2774 NvmeNamespace *ns = req->ns; 2775 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk); 2776 2777 if (iocb->idx != iocb->nr) { 2778 req->cqe.result = cpu_to_le32(iocb->idx); 2779 } 2780 2781 qemu_iovec_destroy(&iocb->iov); 2782 g_free(iocb->bounce); 2783 2784 if (iocb->ret < 0) { 2785 block_acct_failed(stats, &iocb->acct.read); 2786 block_acct_failed(stats, &iocb->acct.write); 2787 } else { 2788 block_acct_done(stats, &iocb->acct.read); 2789 block_acct_done(stats, &iocb->acct.write); 2790 } 2791 2792 iocb->common.cb(iocb->common.opaque, iocb->ret); 2793 qemu_aio_unref(iocb); 2794 } 2795 2796 static void nvme_do_copy(NvmeCopyAIOCB *iocb); 2797 2798 static void nvme_copy_source_range_parse_format0_2(void *ranges, 2799 int idx, uint64_t *slba, 2800 uint32_t *nlb, 2801 uint32_t *snsid, 2802 uint16_t *apptag, 2803 uint16_t *appmask, 2804 
uint64_t *reftag) 2805 { 2806 NvmeCopySourceRangeFormat0_2 *_ranges = ranges; 2807 2808 if (snsid) { 2809 *snsid = le32_to_cpu(_ranges[idx].sparams); 2810 } 2811 2812 if (slba) { 2813 *slba = le64_to_cpu(_ranges[idx].slba); 2814 } 2815 2816 if (nlb) { 2817 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1; 2818 } 2819 2820 if (apptag) { 2821 *apptag = le16_to_cpu(_ranges[idx].apptag); 2822 } 2823 2824 if (appmask) { 2825 *appmask = le16_to_cpu(_ranges[idx].appmask); 2826 } 2827 2828 if (reftag) { 2829 *reftag = le32_to_cpu(_ranges[idx].reftag); 2830 } 2831 } 2832 2833 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx, 2834 uint64_t *slba, 2835 uint32_t *nlb, 2836 uint32_t *snsid, 2837 uint16_t *apptag, 2838 uint16_t *appmask, 2839 uint64_t *reftag) 2840 { 2841 NvmeCopySourceRangeFormat1_3 *_ranges = ranges; 2842 2843 if (snsid) { 2844 *snsid = le32_to_cpu(_ranges[idx].sparams); 2845 } 2846 2847 if (slba) { 2848 *slba = le64_to_cpu(_ranges[idx].slba); 2849 } 2850 2851 if (nlb) { 2852 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1; 2853 } 2854 2855 if (apptag) { 2856 *apptag = le16_to_cpu(_ranges[idx].apptag); 2857 } 2858 2859 if (appmask) { 2860 *appmask = le16_to_cpu(_ranges[idx].appmask); 2861 } 2862 2863 if (reftag) { 2864 *reftag = 0; 2865 2866 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40; 2867 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32; 2868 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24; 2869 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16; 2870 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8; 2871 *reftag |= (uint64_t)_ranges[idx].sr[9]; 2872 } 2873 } 2874 2875 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format, 2876 uint64_t *slba, uint32_t *nlb, 2877 uint32_t *snsid, uint16_t *apptag, 2878 uint16_t *appmask, uint64_t *reftag) 2879 { 2880 switch (format) { 2881 case NVME_COPY_FORMAT_0: 2882 case NVME_COPY_FORMAT_2: 2883 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid, 2884 apptag, appmask, reftag); 2885 break; 2886 2887 case NVME_COPY_FORMAT_1: 2888 case NVME_COPY_FORMAT_3: 2889 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid, 2890 apptag, appmask, reftag); 2891 break; 2892 2893 default: 2894 abort(); 2895 } 2896 } 2897 2898 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, 2899 NvmeCopyAIOCB *iocb, uint16_t nr) 2900 { 2901 uint32_t copy_len = 0; 2902 2903 for (int idx = 0; idx < nr; idx++) { 2904 uint32_t nlb; 2905 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, 2906 &nlb, NULL, NULL, NULL, NULL); 2907 copy_len += nlb; 2908 } 2909 iocb->tcl = copy_len; 2910 if (copy_len > ns->id_ns.mcl) { 2911 return NVME_CMD_SIZE_LIMIT | NVME_DNR; 2912 } 2913 2914 return NVME_SUCCESS; 2915 } 2916 2917 static void nvme_copy_out_completed_cb(void *opaque, int ret) 2918 { 2919 NvmeCopyAIOCB *iocb = opaque; 2920 NvmeRequest *req = iocb->req; 2921 NvmeNamespace *dns = req->ns; 2922 uint32_t nlb; 2923 2924 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL, 2925 &nlb, NULL, NULL, NULL, NULL); 2926 2927 if (ret < 0) { 2928 iocb->ret = ret; 2929 req->status = NVME_WRITE_FAULT; 2930 goto out; 2931 } else if (iocb->ret < 0) { 2932 goto out; 2933 } 2934 2935 if (dns->params.zoned) { 2936 nvme_advance_zone_wp(dns, iocb->zone, nlb); 2937 } 2938 2939 iocb->idx++; 2940 iocb->slba += nlb; 2941 out: 2942 nvme_do_copy(iocb); 2943 } 2944 2945 static void nvme_copy_out_cb(void *opaque, int ret) 2946 { 2947 NvmeCopyAIOCB *iocb = opaque; 2948 NvmeRequest *req = iocb->req; 2949 NvmeNamespace 
*dns = req->ns; 2950 uint32_t nlb; 2951 size_t mlen; 2952 uint8_t *mbounce; 2953 2954 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) { 2955 goto out; 2956 } 2957 2958 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL, 2959 &nlb, NULL, NULL, NULL, NULL); 2960 2961 mlen = nvme_m2b(dns, nlb); 2962 mbounce = iocb->bounce + nvme_l2b(dns, nlb); 2963 2964 qemu_iovec_reset(&iocb->iov); 2965 qemu_iovec_add(&iocb->iov, mbounce, mlen); 2966 2967 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba), 2968 &iocb->iov, 0, nvme_copy_out_completed_cb, 2969 iocb); 2970 2971 return; 2972 2973 out: 2974 nvme_copy_out_completed_cb(iocb, ret); 2975 } 2976 2977 static void nvme_copy_in_completed_cb(void *opaque, int ret) 2978 { 2979 NvmeCopyAIOCB *iocb = opaque; 2980 NvmeRequest *req = iocb->req; 2981 NvmeNamespace *sns = iocb->sns; 2982 NvmeNamespace *dns = req->ns; 2983 NvmeCopyCmd *copy = NULL; 2984 uint8_t *mbounce = NULL; 2985 uint32_t nlb; 2986 uint64_t slba; 2987 uint16_t apptag, appmask; 2988 uint64_t reftag; 2989 size_t len, mlen; 2990 uint16_t status; 2991 2992 if (ret < 0) { 2993 iocb->ret = ret; 2994 req->status = NVME_UNRECOVERED_READ; 2995 goto out; 2996 } else if (iocb->ret < 0) { 2997 goto out; 2998 } 2999 3000 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba, 3001 &nlb, NULL, &apptag, &appmask, &reftag); 3002 3003 trace_pci_nvme_copy_out(iocb->slba, nlb); 3004 3005 len = nvme_l2b(sns, nlb); 3006 3007 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) { 3008 copy = (NvmeCopyCmd *)&req->cmd; 3009 3010 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf); 3011 3012 mlen = nvme_m2b(sns, nlb); 3013 mbounce = iocb->bounce + nvme_l2b(sns, nlb); 3014 3015 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba); 3016 if (status) { 3017 goto invalid; 3018 } 3019 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor, 3020 slba, apptag, appmask, &reftag); 3021 if (status) { 3022 goto invalid; 3023 } 3024 } 3025 3026 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) { 3027 copy = (NvmeCopyCmd *)&req->cmd; 3028 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf); 3029 3030 mlen = nvme_m2b(dns, nlb); 3031 mbounce = iocb->bounce + nvme_l2b(dns, nlb); 3032 3033 apptag = le16_to_cpu(copy->apptag); 3034 appmask = le16_to_cpu(copy->appmask); 3035 3036 if (prinfow & NVME_PRINFO_PRACT) { 3037 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag); 3038 if (status) { 3039 goto invalid; 3040 } 3041 3042 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen, 3043 apptag, &iocb->reftag); 3044 } else { 3045 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen, 3046 prinfow, iocb->slba, apptag, appmask, 3047 &iocb->reftag); 3048 if (status) { 3049 goto invalid; 3050 } 3051 } 3052 } 3053 3054 status = nvme_check_bounds(dns, iocb->slba, nlb); 3055 if (status) { 3056 goto invalid; 3057 } 3058 3059 if (dns->params.zoned) { 3060 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb); 3061 if (status) { 3062 goto invalid; 3063 } 3064 3065 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) { 3066 iocb->zone->w_ptr += nlb; 3067 } 3068 } 3069 3070 qemu_iovec_reset(&iocb->iov); 3071 qemu_iovec_add(&iocb->iov, iocb->bounce, len); 3072 3073 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0, 3074 BLOCK_ACCT_WRITE); 3075 3076 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba), 3077 &iocb->iov, 0, nvme_copy_out_cb, iocb); 3078 3079 return; 3080 3081 invalid: 3082 req->status = status; 3083 
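/* a negative ret makes nvme_do_copy() skip any remaining ranges and complete the command */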
iocb->ret = -1; 3084 out: 3085 nvme_do_copy(iocb); 3086 } 3087 3088 static void nvme_copy_in_cb(void *opaque, int ret) 3089 { 3090 NvmeCopyAIOCB *iocb = opaque; 3091 NvmeNamespace *sns = iocb->sns; 3092 uint64_t slba; 3093 uint32_t nlb; 3094 3095 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) { 3096 goto out; 3097 } 3098 3099 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba, 3100 &nlb, NULL, NULL, NULL, NULL); 3101 3102 qemu_iovec_reset(&iocb->iov); 3103 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb), 3104 nvme_m2b(sns, nlb)); 3105 3106 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba), 3107 &iocb->iov, 0, nvme_copy_in_completed_cb, 3108 iocb); 3109 return; 3110 3111 out: 3112 nvme_copy_in_completed_cb(iocb, ret); 3113 } 3114 3115 static inline bool nvme_csi_supports_copy(uint8_t csi) 3116 { 3117 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED; 3118 } 3119 3120 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns, 3121 NvmeNamespace *dns) 3122 { 3123 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms; 3124 } 3125 3126 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns, 3127 bool pi_enable) 3128 { 3129 if (!nvme_csi_supports_copy(sns->csi) || 3130 !nvme_csi_supports_copy(dns->csi)) { 3131 return false; 3132 } 3133 3134 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) { 3135 return false; 3136 } 3137 3138 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) || 3139 sns->id_ns.dps != dns->id_ns.dps)) { 3140 return false; 3141 } 3142 3143 return true; 3144 } 3145 3146 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns, 3147 NvmeNamespace *dns) 3148 { 3149 return sns->lbaf.ms == 0 && 3150 ((dns->lbaf.ms == 8 && dns->pif == 0) || 3151 (dns->lbaf.ms == 16 && dns->pif == 1)); 3152 } 3153 3154 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns, 3155 bool sns_pi_en) 3156 { 3157 if (!nvme_csi_supports_copy(sns->csi) || 3158 !nvme_csi_supports_copy(dns->csi)) { 3159 return false; 3160 } 3161 3162 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) { 3163 return false; 3164 } 3165 3166 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) { 3167 return false; 3168 } 3169 3170 return true; 3171 } 3172 3173 static void nvme_do_copy(NvmeCopyAIOCB *iocb) 3174 { 3175 NvmeRequest *req = iocb->req; 3176 NvmeNamespace *sns; 3177 NvmeNamespace *dns = req->ns; 3178 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 3179 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf); 3180 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf); 3181 uint64_t slba; 3182 uint32_t nlb; 3183 size_t len; 3184 uint16_t status; 3185 uint32_t dnsid = le32_to_cpu(req->cmd.nsid); 3186 uint32_t snsid = dnsid; 3187 3188 if (iocb->ret < 0) { 3189 goto done; 3190 } 3191 3192 if (iocb->idx == iocb->nr) { 3193 goto done; 3194 } 3195 3196 if (iocb->format == 2 || iocb->format == 3) { 3197 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, 3198 &slba, &nlb, &snsid, NULL, NULL, NULL); 3199 if (snsid != dnsid) { 3200 if (snsid == NVME_NSID_BROADCAST || 3201 !nvme_nsid_valid(iocb->n, snsid)) { 3202 status = NVME_INVALID_NSID | NVME_DNR; 3203 goto invalid; 3204 } 3205 iocb->sns = nvme_ns(iocb->n, snsid); 3206 if (unlikely(!iocb->sns)) { 3207 status = NVME_INVALID_FIELD | NVME_DNR; 3208 goto invalid; 3209 } 3210 } else { 3211 if (((slba + nlb) > iocb->slba) && 3212 ((slba + nlb) < (iocb->slba + iocb->tcl))) { 3213 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR; 3214 
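/* the source range overlaps the LBA range being written by this copy */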
goto invalid; 3215 } 3216 } 3217 } else { 3218 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, 3219 &slba, &nlb, NULL, NULL, NULL, NULL); 3220 } 3221 3222 sns = iocb->sns; 3223 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) && 3224 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) { 3225 status = NVME_INVALID_FIELD | NVME_DNR; 3226 goto invalid; 3227 } else if (snsid != dnsid) { 3228 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) && 3229 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) { 3230 if (!nvme_copy_matching_ns_format(sns, dns, false)) { 3231 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3232 goto invalid; 3233 } 3234 } 3235 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) && 3236 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) { 3237 if ((prinfor & NVME_PRINFO_PRACT) != 3238 (prinfow & NVME_PRINFO_PRACT)) { 3239 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3240 goto invalid; 3241 } else { 3242 if (!nvme_copy_matching_ns_format(sns, dns, true)) { 3243 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3244 goto invalid; 3245 } 3246 } 3247 } 3248 3249 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) && 3250 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) { 3251 if (!(prinfow & NVME_PRINFO_PRACT)) { 3252 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3253 goto invalid; 3254 } else { 3255 if (!nvme_copy_corresp_pi_format(sns, dns, false)) { 3256 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3257 goto invalid; 3258 } 3259 } 3260 } 3261 3262 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) && 3263 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) { 3264 if (!(prinfor & NVME_PRINFO_PRACT)) { 3265 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3266 goto invalid; 3267 } else { 3268 if (!nvme_copy_corresp_pi_format(sns, dns, true)) { 3269 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR; 3270 goto invalid; 3271 } 3272 } 3273 } 3274 } 3275 len = nvme_l2b(sns, nlb); 3276 3277 trace_pci_nvme_copy_source_range(slba, nlb); 3278 3279 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) { 3280 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 3281 goto invalid; 3282 } 3283 3284 status = nvme_check_bounds(sns, slba, nlb); 3285 if (status) { 3286 goto invalid; 3287 } 3288 3289 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) { 3290 status = nvme_check_dulbe(sns, slba, nlb); 3291 if (status) { 3292 goto invalid; 3293 } 3294 } 3295 3296 if (sns->params.zoned) { 3297 status = nvme_check_zone_read(sns, slba, nlb); 3298 if (status) { 3299 goto invalid; 3300 } 3301 } 3302 3303 g_free(iocb->bounce); 3304 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl), 3305 sns->lbasz + sns->lbaf.ms); 3306 3307 qemu_iovec_reset(&iocb->iov); 3308 qemu_iovec_add(&iocb->iov, iocb->bounce, len); 3309 3310 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0, 3311 BLOCK_ACCT_READ); 3312 3313 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba), 3314 &iocb->iov, 0, nvme_copy_in_cb, iocb); 3315 return; 3316 3317 invalid: 3318 req->status = status; 3319 iocb->ret = -1; 3320 done: 3321 nvme_copy_done(iocb); 3322 } 3323 3324 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) 3325 { 3326 NvmeNamespace *ns = req->ns; 3327 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 3328 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk, 3329 nvme_misc_cb, req); 3330 uint16_t nr = copy->nr + 1; 3331 uint8_t format = copy->control[0] & 0xf; 3332 size_t len = sizeof(NvmeCopySourceRangeFormat0_2); 3333 3334 uint16_t status; 3335 3336 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); 3337 3338 iocb->ranges = NULL; 3339 
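/* initialize these before any early exit; the invalid path frees iocb->ranges unconditionally */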
iocb->zone = NULL; 3340 3341 if (!(n->id_ctrl.ocfs & (1 << format)) || 3342 ((format == 2 || format == 3) && 3343 !(n->features.hbs.cdfe & (1 << format)))) { 3344 trace_pci_nvme_err_copy_invalid_format(format); 3345 status = NVME_INVALID_FIELD | NVME_DNR; 3346 goto invalid; 3347 } 3348 3349 if (nr > ns->id_ns.msrc + 1) { 3350 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 3351 goto invalid; 3352 } 3353 3354 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) || 3355 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) { 3356 status = NVME_INVALID_FORMAT | NVME_DNR; 3357 goto invalid; 3358 } 3359 3360 if (ns->pif) { 3361 len = sizeof(NvmeCopySourceRangeFormat1_3); 3362 } 3363 3364 iocb->format = format; 3365 iocb->ranges = g_malloc_n(nr, len); 3366 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req); 3367 if (status) { 3368 goto invalid; 3369 } 3370 3371 iocb->slba = le64_to_cpu(copy->sdlba); 3372 3373 if (ns->params.zoned) { 3374 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba); 3375 if (!iocb->zone) { 3376 status = NVME_LBA_RANGE | NVME_DNR; 3377 goto invalid; 3378 } 3379 3380 status = nvme_zrm_auto(n, ns, iocb->zone); 3381 if (status) { 3382 goto invalid; 3383 } 3384 } 3385 3386 status = nvme_check_copy_mcl(ns, iocb, nr); 3387 if (status) { 3388 goto invalid; 3389 } 3390 3391 iocb->req = req; 3392 iocb->ret = 0; 3393 iocb->nr = nr; 3394 iocb->idx = 0; 3395 iocb->reftag = le32_to_cpu(copy->reftag); 3396 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32; 3397 3398 qemu_iovec_init(&iocb->iov, 1); 3399 3400 req->aiocb = &iocb->common; 3401 iocb->sns = req->ns; 3402 iocb->n = n; 3403 iocb->bounce = NULL; 3404 nvme_do_copy(iocb); 3405 3406 return NVME_NO_COMPLETE; 3407 3408 invalid: 3409 g_free(iocb->ranges); 3410 qemu_aio_unref(iocb); 3411 return status; 3412 } 3413 3414 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) 3415 { 3416 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3417 NvmeNamespace *ns = req->ns; 3418 BlockBackend *blk = ns->blkconf.blk; 3419 uint64_t slba = le64_to_cpu(rw->slba); 3420 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 3421 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 3422 size_t data_len = nvme_l2b(ns, nlb); 3423 size_t len = data_len; 3424 int64_t offset = nvme_l2b(ns, slba); 3425 struct nvme_compare_ctx *ctx = NULL; 3426 uint16_t status; 3427 3428 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); 3429 3430 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) { 3431 return NVME_INVALID_PROT_INFO | NVME_DNR; 3432 } 3433 3434 if (nvme_ns_ext(ns)) { 3435 len += nvme_m2b(ns, nlb); 3436 } 3437 3438 if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) { 3439 status = nvme_check_mdts(n, data_len); 3440 } else { 3441 status = nvme_check_mdts(n, len); 3442 } 3443 if (status) { 3444 return status; 3445 } 3446 3447 status = nvme_check_bounds(ns, slba, nlb); 3448 if (status) { 3449 return status; 3450 } 3451 3452 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 3453 status = nvme_check_dulbe(ns, slba, nlb); 3454 if (status) { 3455 return status; 3456 } 3457 } 3458 3459 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 3460 if (status) { 3461 return status; 3462 } 3463 3464 ctx = g_new(struct nvme_compare_ctx, 1); 3465 ctx->data.bounce = g_malloc(data_len); 3466 3467 req->opaque = ctx; 3468 3469 qemu_iovec_init(&ctx->data.iov, 1); 3470 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len); 3471 3472 block_acct_start(blk_get_stats(blk), &req->acct, data_len, 3473 BLOCK_ACCT_READ); 3474 req->aiocb = 
blk_aio_preadv(blk, offset, &ctx->data.iov, 0, 3475 nvme_compare_data_cb, req); 3476 3477 return NVME_NO_COMPLETE; 3478 } 3479 3480 typedef struct NvmeFlushAIOCB { 3481 BlockAIOCB common; 3482 BlockAIOCB *aiocb; 3483 NvmeRequest *req; 3484 int ret; 3485 3486 NvmeNamespace *ns; 3487 uint32_t nsid; 3488 bool broadcast; 3489 } NvmeFlushAIOCB; 3490 3491 static void nvme_flush_cancel(BlockAIOCB *acb) 3492 { 3493 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common); 3494 3495 iocb->ret = -ECANCELED; 3496 3497 if (iocb->aiocb) { 3498 blk_aio_cancel_async(iocb->aiocb); 3499 iocb->aiocb = NULL; 3500 } 3501 } 3502 3503 static const AIOCBInfo nvme_flush_aiocb_info = { 3504 .aiocb_size = sizeof(NvmeFlushAIOCB), 3505 .cancel_async = nvme_flush_cancel, 3506 }; 3507 3508 static void nvme_do_flush(NvmeFlushAIOCB *iocb); 3509 3510 static void nvme_flush_ns_cb(void *opaque, int ret) 3511 { 3512 NvmeFlushAIOCB *iocb = opaque; 3513 NvmeNamespace *ns = iocb->ns; 3514 3515 if (ret < 0) { 3516 iocb->ret = ret; 3517 iocb->req->status = NVME_WRITE_FAULT; 3518 goto out; 3519 } else if (iocb->ret < 0) { 3520 goto out; 3521 } 3522 3523 if (ns) { 3524 trace_pci_nvme_flush_ns(iocb->nsid); 3525 3526 iocb->ns = NULL; 3527 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb); 3528 return; 3529 } 3530 3531 out: 3532 nvme_do_flush(iocb); 3533 } 3534 3535 static void nvme_do_flush(NvmeFlushAIOCB *iocb) 3536 { 3537 NvmeRequest *req = iocb->req; 3538 NvmeCtrl *n = nvme_ctrl(req); 3539 int i; 3540 3541 if (iocb->ret < 0) { 3542 goto done; 3543 } 3544 3545 if (iocb->broadcast) { 3546 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) { 3547 iocb->ns = nvme_ns(n, i); 3548 if (iocb->ns) { 3549 iocb->nsid = i; 3550 break; 3551 } 3552 } 3553 } 3554 3555 if (!iocb->ns) { 3556 goto done; 3557 } 3558 3559 nvme_flush_ns_cb(iocb, 0); 3560 return; 3561 3562 done: 3563 iocb->common.cb(iocb->common.opaque, iocb->ret); 3564 qemu_aio_unref(iocb); 3565 } 3566 3567 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) 3568 { 3569 NvmeFlushAIOCB *iocb; 3570 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3571 uint16_t status; 3572 3573 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req); 3574 3575 iocb->req = req; 3576 iocb->ret = 0; 3577 iocb->ns = NULL; 3578 iocb->nsid = 0; 3579 iocb->broadcast = (nsid == NVME_NSID_BROADCAST); 3580 3581 if (!iocb->broadcast) { 3582 if (!nvme_nsid_valid(n, nsid)) { 3583 status = NVME_INVALID_NSID | NVME_DNR; 3584 goto out; 3585 } 3586 3587 iocb->ns = nvme_ns(n, nsid); 3588 if (!iocb->ns) { 3589 status = NVME_INVALID_FIELD | NVME_DNR; 3590 goto out; 3591 } 3592 3593 iocb->nsid = nsid; 3594 } 3595 3596 req->aiocb = &iocb->common; 3597 nvme_do_flush(iocb); 3598 3599 return NVME_NO_COMPLETE; 3600 3601 out: 3602 qemu_aio_unref(iocb); 3603 3604 return status; 3605 } 3606 3607 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) 3608 { 3609 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3610 NvmeNamespace *ns = req->ns; 3611 uint64_t slba = le64_to_cpu(rw->slba); 3612 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 3613 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 3614 uint64_t data_size = nvme_l2b(ns, nlb); 3615 uint64_t mapped_size = data_size; 3616 uint64_t data_offset; 3617 BlockBackend *blk = ns->blkconf.blk; 3618 uint16_t status; 3619 3620 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) { 3621 mapped_size += nvme_m2b(ns, nlb); 3622 3623 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3624 bool pract = prinfo & NVME_PRINFO_PRACT; 3625 
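/* if PRACT is set and the metadata holds only the PI tuple, no metadata is transferred to the host, so only the data needs to be mapped */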
3626 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) { 3627 mapped_size = data_size; 3628 } 3629 } 3630 } 3631 3632 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba); 3633 3634 status = nvme_check_mdts(n, mapped_size); 3635 if (status) { 3636 goto invalid; 3637 } 3638 3639 status = nvme_check_bounds(ns, slba, nlb); 3640 if (status) { 3641 goto invalid; 3642 } 3643 3644 if (ns->params.zoned) { 3645 status = nvme_check_zone_read(ns, slba, nlb); 3646 if (status) { 3647 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); 3648 goto invalid; 3649 } 3650 } 3651 3652 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 3653 status = nvme_check_dulbe(ns, slba, nlb); 3654 if (status) { 3655 goto invalid; 3656 } 3657 } 3658 3659 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3660 return nvme_dif_rw(n, req); 3661 } 3662 3663 status = nvme_map_data(n, nlb, req); 3664 if (status) { 3665 goto invalid; 3666 } 3667 3668 data_offset = nvme_l2b(ns, slba); 3669 3670 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 3671 BLOCK_ACCT_READ); 3672 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req); 3673 return NVME_NO_COMPLETE; 3674 3675 invalid: 3676 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); 3677 return status | NVME_DNR; 3678 } 3679 3680 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba, 3681 uint32_t nlb) 3682 { 3683 NvmeNamespace *ns = req->ns; 3684 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3685 uint64_t data_size = nvme_l2b(ns, nlb); 3686 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12); 3687 uint8_t dtype = (dw12 >> 20) & 0xf; 3688 uint16_t pid = le16_to_cpu(rw->dspec); 3689 uint16_t ph, rg, ruhid; 3690 NvmeReclaimUnit *ru; 3691 3692 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT || 3693 !nvme_parse_pid(ns, pid, &ph, &rg)) { 3694 ph = 0; 3695 rg = 0; 3696 } 3697 3698 ruhid = ns->fdp.phs[ph]; 3699 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg]; 3700 3701 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size); 3702 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size); 3703 3704 while (nlb) { 3705 if (nlb < ru->ruamw) { 3706 ru->ruamw -= nlb; 3707 break; 3708 } 3709 3710 nlb -= ru->ruamw; 3711 nvme_update_ruh(n, ns, pid); 3712 } 3713 } 3714 3715 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, 3716 bool wrz) 3717 { 3718 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3719 NvmeNamespace *ns = req->ns; 3720 uint64_t slba = le64_to_cpu(rw->slba); 3721 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 3722 uint16_t ctrl = le16_to_cpu(rw->control); 3723 uint8_t prinfo = NVME_RW_PRINFO(ctrl); 3724 uint64_t data_size = nvme_l2b(ns, nlb); 3725 uint64_t mapped_size = data_size; 3726 uint64_t data_offset; 3727 NvmeZone *zone; 3728 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; 3729 BlockBackend *blk = ns->blkconf.blk; 3730 uint16_t status; 3731 3732 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) { 3733 mapped_size += nvme_m2b(ns, nlb); 3734 3735 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3736 bool pract = prinfo & NVME_PRINFO_PRACT; 3737 3738 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) { 3739 mapped_size -= nvme_m2b(ns, nlb); 3740 } 3741 } 3742 } 3743 3744 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), 3745 nvme_nsid(ns), nlb, mapped_size, slba); 3746 3747 if (!wrz) { 3748 status = nvme_check_mdts(n, mapped_size); 3749 if (status) { 3750 goto invalid; 3751 } 3752 } 3753 3754 status = nvme_check_bounds(ns, slba, nlb); 3755 if (status) { 3756 goto invalid; 3757 } 3758 3759 if 
(ns->params.zoned) { 3760 zone = nvme_get_zone_by_slba(ns, slba); 3761 assert(zone); 3762 3763 if (append) { 3764 bool piremap = !!(ctrl & NVME_RW_PIREMAP); 3765 3766 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) { 3767 return NVME_INVALID_ZONE_OP | NVME_DNR; 3768 } 3769 3770 if (unlikely(slba != zone->d.zslba)) { 3771 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); 3772 status = NVME_INVALID_FIELD; 3773 goto invalid; 3774 } 3775 3776 if (n->params.zasl && 3777 data_size > (uint64_t)n->page_size << n->params.zasl) { 3778 trace_pci_nvme_err_zasl(data_size); 3779 return NVME_INVALID_FIELD | NVME_DNR; 3780 } 3781 3782 slba = zone->w_ptr; 3783 rw->slba = cpu_to_le64(slba); 3784 res->slba = cpu_to_le64(slba); 3785 3786 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3787 case NVME_ID_NS_DPS_TYPE_1: 3788 if (!piremap) { 3789 return NVME_INVALID_PROT_INFO | NVME_DNR; 3790 } 3791 3792 /* fallthrough */ 3793 3794 case NVME_ID_NS_DPS_TYPE_2: 3795 if (piremap) { 3796 uint32_t reftag = le32_to_cpu(rw->reftag); 3797 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba)); 3798 } 3799 3800 break; 3801 3802 case NVME_ID_NS_DPS_TYPE_3: 3803 if (piremap) { 3804 return NVME_INVALID_PROT_INFO | NVME_DNR; 3805 } 3806 3807 break; 3808 } 3809 } 3810 3811 status = nvme_check_zone_write(ns, zone, slba, nlb); 3812 if (status) { 3813 goto invalid; 3814 } 3815 3816 status = nvme_zrm_auto(n, ns, zone); 3817 if (status) { 3818 goto invalid; 3819 } 3820 3821 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) { 3822 zone->w_ptr += nlb; 3823 } 3824 } else if (ns->endgrp && ns->endgrp->fdp.enabled) { 3825 nvme_do_write_fdp(n, req, slba, nlb); 3826 } 3827 3828 data_offset = nvme_l2b(ns, slba); 3829 3830 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3831 return nvme_dif_rw(n, req); 3832 } 3833 3834 if (!wrz) { 3835 status = nvme_map_data(n, nlb, req); 3836 if (status) { 3837 goto invalid; 3838 } 3839 3840 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 3841 BLOCK_ACCT_WRITE); 3842 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req); 3843 } else { 3844 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, 3845 BDRV_REQ_MAY_UNMAP, nvme_rw_cb, 3846 req); 3847 } 3848 3849 return NVME_NO_COMPLETE; 3850 3851 invalid: 3852 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); 3853 return status | NVME_DNR; 3854 } 3855 3856 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) 3857 { 3858 return nvme_do_write(n, req, false, false); 3859 } 3860 3861 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) 3862 { 3863 return nvme_do_write(n, req, false, true); 3864 } 3865 3866 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) 3867 { 3868 return nvme_do_write(n, req, true, false); 3869 } 3870 3871 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, 3872 uint64_t *slba, uint32_t *zone_idx) 3873 { 3874 uint32_t dw10 = le32_to_cpu(c->cdw10); 3875 uint32_t dw11 = le32_to_cpu(c->cdw11); 3876 3877 if (!ns->params.zoned) { 3878 trace_pci_nvme_err_invalid_opc(c->opcode); 3879 return NVME_INVALID_OPCODE | NVME_DNR; 3880 } 3881 3882 *slba = ((uint64_t)dw11) << 32 | dw10; 3883 if (unlikely(*slba >= ns->id_ns.nsze)) { 3884 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze); 3885 *slba = 0; 3886 return NVME_LBA_RANGE | NVME_DNR; 3887 } 3888 3889 *zone_idx = nvme_zone_idx(ns, *slba); 3890 assert(*zone_idx < ns->num_zones); 3891 3892 return NVME_SUCCESS; 3893 } 3894 3895 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone 
*, NvmeZoneState, 3896 NvmeRequest *); 3897 3898 enum NvmeZoneProcessingMask { 3899 NVME_PROC_CURRENT_ZONE = 0, 3900 NVME_PROC_OPENED_ZONES = 1 << 0, 3901 NVME_PROC_CLOSED_ZONES = 1 << 1, 3902 NVME_PROC_READ_ONLY_ZONES = 1 << 2, 3903 NVME_PROC_FULL_ZONES = 1 << 3, 3904 }; 3905 3906 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, 3907 NvmeZoneState state, NvmeRequest *req) 3908 { 3909 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd; 3910 int flags = 0; 3911 3912 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) { 3913 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs); 3914 3915 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) { 3916 return NVME_INVALID_ZONE_OP | NVME_DNR; 3917 } 3918 3919 if (zone->w_ptr % ns->zns.zrwafg) { 3920 return NVME_NOZRWA | NVME_DNR; 3921 } 3922 3923 flags = NVME_ZRM_ZRWA; 3924 } 3925 3926 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags); 3927 } 3928 3929 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, 3930 NvmeZoneState state, NvmeRequest *req) 3931 { 3932 return nvme_zrm_close(ns, zone); 3933 } 3934 3935 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, 3936 NvmeZoneState state, NvmeRequest *req) 3937 { 3938 return nvme_zrm_finish(ns, zone); 3939 } 3940 3941 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, 3942 NvmeZoneState state, NvmeRequest *req) 3943 { 3944 switch (state) { 3945 case NVME_ZONE_STATE_READ_ONLY: 3946 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE); 3947 /* fall through */ 3948 case NVME_ZONE_STATE_OFFLINE: 3949 return NVME_SUCCESS; 3950 default: 3951 return NVME_ZONE_INVAL_TRANSITION; 3952 } 3953 } 3954 3955 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) 3956 { 3957 uint16_t status; 3958 uint8_t state = nvme_get_zone_state(zone); 3959 3960 if (state == NVME_ZONE_STATE_EMPTY) { 3961 status = nvme_aor_check(ns, 1, 0); 3962 if (status) { 3963 return status; 3964 } 3965 nvme_aor_inc_active(ns); 3966 zone->d.za |= NVME_ZA_ZD_EXT_VALID; 3967 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 3968 return NVME_SUCCESS; 3969 } 3970 3971 return NVME_ZONE_INVAL_TRANSITION; 3972 } 3973 3974 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, 3975 enum NvmeZoneProcessingMask proc_mask, 3976 op_handler_t op_hndlr, NvmeRequest *req) 3977 { 3978 uint16_t status = NVME_SUCCESS; 3979 NvmeZoneState zs = nvme_get_zone_state(zone); 3980 bool proc_zone; 3981 3982 switch (zs) { 3983 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 3984 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 3985 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; 3986 break; 3987 case NVME_ZONE_STATE_CLOSED: 3988 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; 3989 break; 3990 case NVME_ZONE_STATE_READ_ONLY: 3991 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES; 3992 break; 3993 case NVME_ZONE_STATE_FULL: 3994 proc_zone = proc_mask & NVME_PROC_FULL_ZONES; 3995 break; 3996 default: 3997 proc_zone = false; 3998 } 3999 4000 if (proc_zone) { 4001 status = op_hndlr(ns, zone, zs, req); 4002 } 4003 4004 return status; 4005 } 4006 4007 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, 4008 enum NvmeZoneProcessingMask proc_mask, 4009 op_handler_t op_hndlr, NvmeRequest *req) 4010 { 4011 NvmeZone *next; 4012 uint16_t status = NVME_SUCCESS; 4013 int i; 4014 4015 if (!proc_mask) { 4016 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req); 4017 } else { 4018 if (proc_mask & NVME_PROC_CLOSED_ZONES) { 4019 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { 4020 
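/* the handler may transition the zone and move it to a different list, hence the _SAFE iteration */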
status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 4021 req); 4022 if (status && status != NVME_NO_COMPLETE) { 4023 goto out; 4024 } 4025 } 4026 } 4027 if (proc_mask & NVME_PROC_OPENED_ZONES) { 4028 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { 4029 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 4030 req); 4031 if (status && status != NVME_NO_COMPLETE) { 4032 goto out; 4033 } 4034 } 4035 4036 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { 4037 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 4038 req); 4039 if (status && status != NVME_NO_COMPLETE) { 4040 goto out; 4041 } 4042 } 4043 } 4044 if (proc_mask & NVME_PROC_FULL_ZONES) { 4045 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { 4046 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 4047 req); 4048 if (status && status != NVME_NO_COMPLETE) { 4049 goto out; 4050 } 4051 } 4052 } 4053 4054 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { 4055 for (i = 0; i < ns->num_zones; i++, zone++) { 4056 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 4057 req); 4058 if (status && status != NVME_NO_COMPLETE) { 4059 goto out; 4060 } 4061 } 4062 } 4063 } 4064 4065 out: 4066 return status; 4067 } 4068 4069 typedef struct NvmeZoneResetAIOCB { 4070 BlockAIOCB common; 4071 BlockAIOCB *aiocb; 4072 NvmeRequest *req; 4073 int ret; 4074 4075 bool all; 4076 int idx; 4077 NvmeZone *zone; 4078 } NvmeZoneResetAIOCB; 4079 4080 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb) 4081 { 4082 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common); 4083 NvmeRequest *req = iocb->req; 4084 NvmeNamespace *ns = req->ns; 4085 4086 iocb->idx = ns->num_zones; 4087 4088 iocb->ret = -ECANCELED; 4089 4090 if (iocb->aiocb) { 4091 blk_aio_cancel_async(iocb->aiocb); 4092 iocb->aiocb = NULL; 4093 } 4094 } 4095 4096 static const AIOCBInfo nvme_zone_reset_aiocb_info = { 4097 .aiocb_size = sizeof(NvmeZoneResetAIOCB), 4098 .cancel_async = nvme_zone_reset_cancel, 4099 }; 4100 4101 static void nvme_zone_reset_cb(void *opaque, int ret); 4102 4103 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret) 4104 { 4105 NvmeZoneResetAIOCB *iocb = opaque; 4106 NvmeRequest *req = iocb->req; 4107 NvmeNamespace *ns = req->ns; 4108 int64_t moff; 4109 int count; 4110 4111 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) { 4112 goto out; 4113 } 4114 4115 moff = nvme_moff(ns, iocb->zone->d.zslba); 4116 count = nvme_m2b(ns, ns->zone_size); 4117 4118 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count, 4119 BDRV_REQ_MAY_UNMAP, 4120 nvme_zone_reset_cb, iocb); 4121 return; 4122 4123 out: 4124 nvme_zone_reset_cb(iocb, ret); 4125 } 4126 4127 static void nvme_zone_reset_cb(void *opaque, int ret) 4128 { 4129 NvmeZoneResetAIOCB *iocb = opaque; 4130 NvmeRequest *req = iocb->req; 4131 NvmeNamespace *ns = req->ns; 4132 4133 if (iocb->ret < 0) { 4134 goto done; 4135 } else if (ret < 0) { 4136 iocb->ret = ret; 4137 goto done; 4138 } 4139 4140 if (iocb->zone) { 4141 nvme_zrm_reset(ns, iocb->zone); 4142 4143 if (!iocb->all) { 4144 goto done; 4145 } 4146 } 4147 4148 while (iocb->idx < ns->num_zones) { 4149 NvmeZone *zone = &ns->zone_array[iocb->idx++]; 4150 4151 switch (nvme_get_zone_state(zone)) { 4152 case NVME_ZONE_STATE_EMPTY: 4153 if (!iocb->all) { 4154 goto done; 4155 } 4156 4157 continue; 4158 4159 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 4160 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 4161 case NVME_ZONE_STATE_CLOSED: 4162 case NVME_ZONE_STATE_FULL: 4163 iocb->zone = zone; 4164 break; 4165 
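/* read only and offline zones cannot be reset and are skipped */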
4166 default: 4167 continue; 4168 } 4169 4170 trace_pci_nvme_zns_zone_reset(zone->d.zslba); 4171 4172 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, 4173 nvme_l2b(ns, zone->d.zslba), 4174 nvme_l2b(ns, ns->zone_size), 4175 BDRV_REQ_MAY_UNMAP, 4176 nvme_zone_reset_epilogue_cb, 4177 iocb); 4178 return; 4179 } 4180 4181 done: 4182 iocb->aiocb = NULL; 4183 4184 iocb->common.cb(iocb->common.opaque, iocb->ret); 4185 qemu_aio_unref(iocb); 4186 } 4187 4188 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone, 4189 uint64_t elba, NvmeRequest *req) 4190 { 4191 NvmeNamespace *ns = req->ns; 4192 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs); 4193 uint64_t wp = zone->d.wp; 4194 uint32_t nlb = elba - wp + 1; 4195 uint16_t status; 4196 4197 4198 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) { 4199 return NVME_INVALID_ZONE_OP | NVME_DNR; 4200 } 4201 4202 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) { 4203 return NVME_INVALID_FIELD | NVME_DNR; 4204 } 4205 4206 if (elba < wp || elba > wp + ns->zns.zrwas) { 4207 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR; 4208 } 4209 4210 if (nlb % ns->zns.zrwafg) { 4211 return NVME_INVALID_FIELD | NVME_DNR; 4212 } 4213 4214 status = nvme_zrm_auto(n, ns, zone); 4215 if (status) { 4216 return status; 4217 } 4218 4219 zone->w_ptr += nlb; 4220 4221 nvme_advance_zone_wp(ns, zone, nlb); 4222 4223 return NVME_SUCCESS; 4224 } 4225 4226 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) 4227 { 4228 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd; 4229 NvmeNamespace *ns = req->ns; 4230 NvmeZone *zone; 4231 NvmeZoneResetAIOCB *iocb; 4232 uint8_t *zd_ext; 4233 uint64_t slba = 0; 4234 uint32_t zone_idx = 0; 4235 uint16_t status; 4236 uint8_t action = cmd->zsa; 4237 bool all; 4238 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; 4239 4240 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL; 4241 4242 req->status = NVME_SUCCESS; 4243 4244 if (!all) { 4245 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx); 4246 if (status) { 4247 return status; 4248 } 4249 } 4250 4251 zone = &ns->zone_array[zone_idx]; 4252 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) { 4253 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); 4254 return NVME_INVALID_FIELD | NVME_DNR; 4255 } 4256 4257 switch (action) { 4258 4259 case NVME_ZONE_ACTION_OPEN: 4260 if (all) { 4261 proc_mask = NVME_PROC_CLOSED_ZONES; 4262 } 4263 trace_pci_nvme_open_zone(slba, zone_idx, all); 4264 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req); 4265 break; 4266 4267 case NVME_ZONE_ACTION_CLOSE: 4268 if (all) { 4269 proc_mask = NVME_PROC_OPENED_ZONES; 4270 } 4271 trace_pci_nvme_close_zone(slba, zone_idx, all); 4272 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req); 4273 break; 4274 4275 case NVME_ZONE_ACTION_FINISH: 4276 if (all) { 4277 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; 4278 } 4279 trace_pci_nvme_finish_zone(slba, zone_idx, all); 4280 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req); 4281 break; 4282 4283 case NVME_ZONE_ACTION_RESET: 4284 trace_pci_nvme_reset_zone(slba, zone_idx, all); 4285 4286 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk, 4287 nvme_misc_cb, req); 4288 4289 iocb->req = req; 4290 iocb->ret = 0; 4291 iocb->all = all; 4292 iocb->idx = zone_idx; 4293 iocb->zone = NULL; 4294 4295 req->aiocb = &iocb->common; 4296 nvme_zone_reset_cb(iocb, 0); 4297 4298 return NVME_NO_COMPLETE; 4299 4300 case NVME_ZONE_ACTION_OFFLINE: 4301 if 
(all) { 4302 proc_mask = NVME_PROC_READ_ONLY_ZONES; 4303 } 4304 trace_pci_nvme_offline_zone(slba, zone_idx, all); 4305 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req); 4306 break; 4307 4308 case NVME_ZONE_ACTION_SET_ZD_EXT: 4309 trace_pci_nvme_set_descriptor_extension(slba, zone_idx); 4310 if (all || !ns->params.zd_extension_size) { 4311 return NVME_INVALID_FIELD | NVME_DNR; 4312 } 4313 zd_ext = nvme_get_zd_extension(ns, zone_idx); 4314 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req); 4315 if (status) { 4316 trace_pci_nvme_err_zd_extension_map_error(zone_idx); 4317 return status; 4318 } 4319 4320 status = nvme_set_zd_ext(ns, zone); 4321 if (status == NVME_SUCCESS) { 4322 trace_pci_nvme_zd_extension_set(zone_idx); 4323 return status; 4324 } 4325 break; 4326 4327 case NVME_ZONE_ACTION_ZRWA_FLUSH: 4328 if (all) { 4329 return NVME_INVALID_FIELD | NVME_DNR; 4330 } 4331 4332 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req); 4333 4334 default: 4335 trace_pci_nvme_err_invalid_mgmt_action(action); 4336 status = NVME_INVALID_FIELD; 4337 } 4338 4339 if (status == NVME_ZONE_INVAL_TRANSITION) { 4340 trace_pci_nvme_err_invalid_zone_state_transition(action, slba, 4341 zone->d.za); 4342 } 4343 if (status) { 4344 status |= NVME_DNR; 4345 } 4346 4347 return status; 4348 } 4349 4350 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) 4351 { 4352 NvmeZoneState zs = nvme_get_zone_state(zl); 4353 4354 switch (zafs) { 4355 case NVME_ZONE_REPORT_ALL: 4356 return true; 4357 case NVME_ZONE_REPORT_EMPTY: 4358 return zs == NVME_ZONE_STATE_EMPTY; 4359 case NVME_ZONE_REPORT_IMPLICITLY_OPEN: 4360 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN; 4361 case NVME_ZONE_REPORT_EXPLICITLY_OPEN: 4362 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN; 4363 case NVME_ZONE_REPORT_CLOSED: 4364 return zs == NVME_ZONE_STATE_CLOSED; 4365 case NVME_ZONE_REPORT_FULL: 4366 return zs == NVME_ZONE_STATE_FULL; 4367 case NVME_ZONE_REPORT_READ_ONLY: 4368 return zs == NVME_ZONE_STATE_READ_ONLY; 4369 case NVME_ZONE_REPORT_OFFLINE: 4370 return zs == NVME_ZONE_STATE_OFFLINE; 4371 default: 4372 return false; 4373 } 4374 } 4375 4376 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) 4377 { 4378 NvmeCmd *cmd = &req->cmd; 4379 NvmeNamespace *ns = req->ns; 4380 /* cdw12 is zero-based number of dwords to return. 
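For example, a zero-based CDW12 value of 0x3ff requests (0x3ff + 1) = 0x400 dwords, i.e. a 4 KiB report buffer.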
Convert to bytes */ 4381 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2; 4382 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 4383 uint32_t zone_idx, zra, zrasf, partial; 4384 uint64_t max_zones, nr_zones = 0; 4385 uint16_t status; 4386 uint64_t slba; 4387 NvmeZoneDescr *z; 4388 NvmeZone *zone; 4389 NvmeZoneReportHeader *header; 4390 void *buf, *buf_p; 4391 size_t zone_entry_sz; 4392 int i; 4393 4394 req->status = NVME_SUCCESS; 4395 4396 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 4397 if (status) { 4398 return status; 4399 } 4400 4401 zra = dw13 & 0xff; 4402 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) { 4403 return NVME_INVALID_FIELD | NVME_DNR; 4404 } 4405 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) { 4406 return NVME_INVALID_FIELD | NVME_DNR; 4407 } 4408 4409 zrasf = (dw13 >> 8) & 0xff; 4410 if (zrasf > NVME_ZONE_REPORT_OFFLINE) { 4411 return NVME_INVALID_FIELD | NVME_DNR; 4412 } 4413 4414 if (data_size < sizeof(NvmeZoneReportHeader)) { 4415 return NVME_INVALID_FIELD | NVME_DNR; 4416 } 4417 4418 status = nvme_check_mdts(n, data_size); 4419 if (status) { 4420 return status; 4421 } 4422 4423 partial = (dw13 >> 16) & 0x01; 4424 4425 zone_entry_sz = sizeof(NvmeZoneDescr); 4426 if (zra == NVME_ZONE_REPORT_EXTENDED) { 4427 zone_entry_sz += ns->params.zd_extension_size; 4428 } 4429 4430 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; 4431 buf = g_malloc0(data_size); 4432 4433 zone = &ns->zone_array[zone_idx]; 4434 for (i = zone_idx; i < ns->num_zones; i++) { 4435 if (partial && nr_zones >= max_zones) { 4436 break; 4437 } 4438 if (nvme_zone_matches_filter(zrasf, zone++)) { 4439 nr_zones++; 4440 } 4441 } 4442 header = buf; 4443 header->nr_zones = cpu_to_le64(nr_zones); 4444 4445 buf_p = buf + sizeof(NvmeZoneReportHeader); 4446 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) { 4447 zone = &ns->zone_array[zone_idx]; 4448 if (nvme_zone_matches_filter(zrasf, zone)) { 4449 z = buf_p; 4450 buf_p += sizeof(NvmeZoneDescr); 4451 4452 z->zt = zone->d.zt; 4453 z->zs = zone->d.zs; 4454 z->zcap = cpu_to_le64(zone->d.zcap); 4455 z->zslba = cpu_to_le64(zone->d.zslba); 4456 z->za = zone->d.za; 4457 4458 if (nvme_wp_is_valid(zone)) { 4459 z->wp = cpu_to_le64(zone->d.wp); 4460 } else { 4461 z->wp = cpu_to_le64(~0ULL); 4462 } 4463 4464 if (zra == NVME_ZONE_REPORT_EXTENDED) { 4465 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) { 4466 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx), 4467 ns->params.zd_extension_size); 4468 } 4469 buf_p += ns->params.zd_extension_size; 4470 } 4471 4472 max_zones--; 4473 } 4474 } 4475 4476 status = nvme_c2h(n, (uint8_t *)buf, data_size, req); 4477 4478 g_free(buf); 4479 4480 return status; 4481 } 4482 4483 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req, 4484 size_t len) 4485 { 4486 NvmeNamespace *ns = req->ns; 4487 NvmeEnduranceGroup *endgrp; 4488 NvmeRuhStatus *hdr; 4489 NvmeRuhStatusDescr *ruhsd; 4490 unsigned int nruhsd; 4491 uint16_t rg, ph, *ruhid; 4492 size_t trans_len; 4493 g_autofree uint8_t *buf = NULL; 4494 4495 if (!n->subsys) { 4496 return NVME_INVALID_FIELD | NVME_DNR; 4497 } 4498 4499 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) { 4500 return NVME_INVALID_NSID | NVME_DNR; 4501 } 4502 4503 if (!n->subsys->endgrp.fdp.enabled) { 4504 return NVME_FDP_DISABLED | NVME_DNR; 4505 } 4506 4507 endgrp = ns->endgrp; 4508 4509 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg; 4510 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr); 
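
    /*
     * The response is a fixed NvmeRuhStatus header followed by one
     * NvmeRuhStatusDescr per (placement handle, reclaim group) pair, i.e.
     * nruhsd descriptors (e.g. nphs == 2 and nrg == 4 gives 8 descriptors).
     * The transfer below is truncated to the host buffer length (len).
     */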
4511 buf = g_malloc0(trans_len); 4512 4513 trans_len = MIN(trans_len, len); 4514 4515 hdr = (NvmeRuhStatus *)buf; 4516 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus)); 4517 4518 hdr->nruhsd = cpu_to_le16(nruhsd); 4519 4520 ruhid = ns->fdp.phs; 4521 4522 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) { 4523 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid]; 4524 4525 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) { 4526 uint16_t pid = nvme_make_pid(ns, rg, ph); 4527 4528 ruhsd->pid = cpu_to_le16(pid); 4529 ruhsd->ruhid = *ruhid; 4530 ruhsd->earutr = 0; 4531 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw); 4532 } 4533 } 4534 4535 return nvme_c2h(n, buf, trans_len, req); 4536 } 4537 4538 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) 4539 { 4540 NvmeCmd *cmd = &req->cmd; 4541 uint32_t cdw10 = le32_to_cpu(cmd->cdw10); 4542 uint32_t numd = le32_to_cpu(cmd->cdw11); 4543 uint8_t mo = (cdw10 & 0xff); 4544 size_t len = (numd + 1) << 2; 4545 4546 switch (mo) { 4547 case NVME_IOMR_MO_NOP: 4548 return 0; 4549 case NVME_IOMR_MO_RUH_STATUS: 4550 return nvme_io_mgmt_recv_ruhs(n, req, len); 4551 default: 4552 return NVME_INVALID_FIELD | NVME_DNR; 4553 }; 4554 } 4555 4556 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) 4557 { 4558 NvmeCmd *cmd = &req->cmd; 4559 NvmeNamespace *ns = req->ns; 4560 uint32_t cdw10 = le32_to_cpu(cmd->cdw10); 4561 uint16_t ret = NVME_SUCCESS; 4562 uint32_t npid = (cdw10 >> 16) + 1; 4563 unsigned int i = 0; 4564 g_autofree uint16_t *pids = NULL; 4565 uint32_t maxnpid; 4566 4567 if (!ns->endgrp || !ns->endgrp->fdp.enabled) { 4568 return NVME_FDP_DISABLED | NVME_DNR; 4569 } 4570 4571 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh; 4572 4573 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) { 4574 return NVME_INVALID_FIELD | NVME_DNR; 4575 } 4576 4577 pids = g_new(uint16_t, npid); 4578 4579 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req); 4580 if (ret) { 4581 return ret; 4582 } 4583 4584 for (; i < npid; i++) { 4585 if (!nvme_update_ruh(n, ns, pids[i])) { 4586 return NVME_INVALID_FIELD | NVME_DNR; 4587 } 4588 } 4589 4590 return ret; 4591 } 4592 4593 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req) 4594 { 4595 NvmeCmd *cmd = &req->cmd; 4596 uint32_t cdw10 = le32_to_cpu(cmd->cdw10); 4597 uint8_t mo = (cdw10 & 0xff); 4598 4599 switch (mo) { 4600 case NVME_IOMS_MO_NOP: 4601 return 0; 4602 case NVME_IOMS_MO_RUH_UPDATE: 4603 return nvme_io_mgmt_send_ruh_update(n, req); 4604 default: 4605 return NVME_INVALID_FIELD | NVME_DNR; 4606 }; 4607 } 4608 4609 static uint16_t __nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req) 4610 { 4611 switch (req->cmd.opcode) { 4612 case NVME_CMD_WRITE: 4613 return nvme_write(n, req); 4614 case NVME_CMD_READ: 4615 return nvme_read(n, req); 4616 case NVME_CMD_COMPARE: 4617 return nvme_compare(n, req); 4618 case NVME_CMD_WRITE_ZEROES: 4619 return nvme_write_zeroes(n, req); 4620 case NVME_CMD_DSM: 4621 return nvme_dsm(n, req); 4622 case NVME_CMD_VERIFY: 4623 return nvme_verify(n, req); 4624 case NVME_CMD_COPY: 4625 return nvme_copy(n, req); 4626 case NVME_CMD_IO_MGMT_RECV: 4627 return nvme_io_mgmt_recv(n, req); 4628 case NVME_CMD_IO_MGMT_SEND: 4629 return nvme_io_mgmt_send(n, req); 4630 } 4631 4632 g_assert_not_reached(); 4633 } 4634 4635 static uint16_t nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req) 4636 { 4637 if (!(n->cse.iocs.nvm[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 4638 trace_pci_nvme_err_invalid_opc(req->cmd.opcode); 4639 return NVME_INVALID_OPCODE | NVME_DNR; 
    }

    return __nvme_io_cmd_nvm(n, req);
}

static uint16_t nvme_io_cmd_zoned(NvmeCtrl *n, NvmeRequest *req)
{
    if (!(n->cse.iocs.zoned[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    switch (req->cmd.opcode) {
    case NVME_CMD_ZONE_APPEND:
        return nvme_zone_append(n, req);
    case NVME_CMD_ZONE_MGMT_SEND:
        return nvme_zone_mgmt_send(n, req);
    case NVME_CMD_ZONE_MGMT_RECV:
        return nvme_zone_mgmt_recv(n, req);
    }

    return __nvme_io_cmd_nvm(n, req);
}

static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);

    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));

    /*
     * In the base NVM command set, Flush may apply to all namespaces
     * (indicated by NSID being set to FFFFFFFFh). However, if that feature
     * is used together with TP 4056 (Namespace Types), the semantics become
     * ambiguous.
     *
     * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
     * opcode with a specific command, since we cannot determine a unique I/O
     * command set. Opcode 0h may have completely different semantics in
     * another command set, so does an NSID of FFFFFFFFh mean "for all
     * namespaces, apply whatever command set specific command uses the 0h
     * opcode", or does it mean "for all namespaces, apply the command that
     * uses the 0h opcode if, and only if, that command allows NSID to be
     * FFFFFFFFh"?
     *
     * Luckily, for now, we do not have to care about this, since the device
     * only supports namespace types that include the NVM Flush command
     * (NVM and Zoned), so always do an NVM Flush.
4690 */ 4691 4692 if (req->cmd.opcode == NVME_CMD_FLUSH) { 4693 return nvme_flush(n, req); 4694 } 4695 4696 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4697 return NVME_INVALID_NSID | NVME_DNR; 4698 } 4699 4700 ns = nvme_ns(n, nsid); 4701 if (unlikely(!ns)) { 4702 return NVME_INVALID_FIELD | NVME_DNR; 4703 } 4704 4705 if (ns->status) { 4706 return ns->status; 4707 } 4708 4709 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) { 4710 return NVME_INVALID_FIELD; 4711 } 4712 4713 req->ns = ns; 4714 4715 switch (ns->csi) { 4716 case NVME_CSI_NVM: 4717 return nvme_io_cmd_nvm(n, req); 4718 case NVME_CSI_ZONED: 4719 return nvme_io_cmd_zoned(n, req); 4720 } 4721 4722 g_assert_not_reached(); 4723 } 4724 4725 static void nvme_cq_notifier(EventNotifier *e) 4726 { 4727 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); 4728 NvmeCtrl *n = cq->ctrl; 4729 4730 if (!event_notifier_test_and_clear(e)) { 4731 return; 4732 } 4733 4734 nvme_update_cq_head(cq); 4735 4736 if (cq->tail == cq->head) { 4737 if (cq->irq_enabled) { 4738 n->cq_pending--; 4739 } 4740 4741 nvme_irq_deassert(n, cq); 4742 } 4743 4744 qemu_bh_schedule(cq->bh); 4745 } 4746 4747 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) 4748 { 4749 NvmeCtrl *n = cq->ctrl; 4750 uint16_t offset = (cq->cqid << 3) + (1 << 2); 4751 int ret; 4752 4753 ret = event_notifier_init(&cq->notifier, 0); 4754 if (ret < 0) { 4755 return ret; 4756 } 4757 4758 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier); 4759 memory_region_add_eventfd(&n->iomem, 4760 0x1000 + offset, 4, false, 0, &cq->notifier); 4761 4762 return 0; 4763 } 4764 4765 static void nvme_sq_notifier(EventNotifier *e) 4766 { 4767 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); 4768 4769 if (!event_notifier_test_and_clear(e)) { 4770 return; 4771 } 4772 4773 nvme_process_sq(sq); 4774 } 4775 4776 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) 4777 { 4778 NvmeCtrl *n = sq->ctrl; 4779 uint16_t offset = sq->sqid << 3; 4780 int ret; 4781 4782 ret = event_notifier_init(&sq->notifier, 0); 4783 if (ret < 0) { 4784 return ret; 4785 } 4786 4787 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier); 4788 memory_region_add_eventfd(&n->iomem, 4789 0x1000 + offset, 4, false, 0, &sq->notifier); 4790 4791 return 0; 4792 } 4793 4794 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) 4795 { 4796 uint16_t offset = sq->sqid << 3; 4797 4798 n->sq[sq->sqid] = NULL; 4799 qemu_bh_delete(sq->bh); 4800 if (sq->ioeventfd_enabled) { 4801 memory_region_del_eventfd(&n->iomem, 4802 0x1000 + offset, 4, false, 0, &sq->notifier); 4803 event_notifier_set_handler(&sq->notifier, NULL); 4804 event_notifier_cleanup(&sq->notifier); 4805 } 4806 g_free(sq->io_req); 4807 if (sq->sqid) { 4808 g_free(sq); 4809 } 4810 } 4811 4812 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) 4813 { 4814 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 4815 NvmeRequest *r, *next; 4816 NvmeSQueue *sq; 4817 NvmeCQueue *cq; 4818 uint16_t qid = le16_to_cpu(c->qid); 4819 4820 if (unlikely(!qid || nvme_check_sqid(n, qid))) { 4821 trace_pci_nvme_err_invalid_del_sq(qid); 4822 return NVME_INVALID_QID | NVME_DNR; 4823 } 4824 4825 trace_pci_nvme_del_sq(qid); 4826 4827 sq = n->sq[qid]; 4828 while (!QTAILQ_EMPTY(&sq->out_req_list)) { 4829 r = QTAILQ_FIRST(&sq->out_req_list); 4830 assert(r->aiocb); 4831 r->status = NVME_CMD_ABORT_SQ_DEL; 4832 blk_aio_cancel(r->aiocb); 4833 } 4834 4835 assert(QTAILQ_EMPTY(&sq->out_req_list)); 4836 4837 if (!nvme_check_cqid(n, sq->cqid)) { 4838 cq = n->cq[sq->cqid]; 4839 QTAILQ_REMOVE(&cq->sq_list, 
sq, entry); 4840 4841 nvme_post_cqes(cq); 4842 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) { 4843 if (r->sq == sq) { 4844 QTAILQ_REMOVE(&cq->req_list, r, entry); 4845 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry); 4846 } 4847 } 4848 } 4849 4850 nvme_free_sq(sq, n); 4851 return NVME_SUCCESS; 4852 } 4853 4854 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, 4855 uint16_t sqid, uint16_t cqid, uint16_t size) 4856 { 4857 int i; 4858 NvmeCQueue *cq; 4859 4860 sq->ctrl = n; 4861 sq->dma_addr = dma_addr; 4862 sq->sqid = sqid; 4863 sq->size = size; 4864 sq->cqid = cqid; 4865 sq->head = sq->tail = 0; 4866 sq->io_req = g_new0(NvmeRequest, sq->size); 4867 4868 QTAILQ_INIT(&sq->req_list); 4869 QTAILQ_INIT(&sq->out_req_list); 4870 for (i = 0; i < sq->size; i++) { 4871 sq->io_req[i].sq = sq; 4872 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry); 4873 } 4874 4875 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq, 4876 &DEVICE(sq->ctrl)->mem_reentrancy_guard); 4877 4878 if (n->dbbuf_enabled) { 4879 sq->db_addr = n->dbbuf_dbs + (sqid << 3); 4880 sq->ei_addr = n->dbbuf_eis + (sqid << 3); 4881 4882 if (n->params.ioeventfd && sq->sqid != 0) { 4883 if (!nvme_init_sq_ioeventfd(sq)) { 4884 sq->ioeventfd_enabled = true; 4885 } 4886 } 4887 } 4888 4889 assert(n->cq[cqid]); 4890 cq = n->cq[cqid]; 4891 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry); 4892 n->sq[sqid] = sq; 4893 } 4894 4895 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) 4896 { 4897 NvmeSQueue *sq; 4898 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd; 4899 4900 uint16_t cqid = le16_to_cpu(c->cqid); 4901 uint16_t sqid = le16_to_cpu(c->sqid); 4902 uint16_t qsize = le16_to_cpu(c->qsize); 4903 uint16_t qflags = le16_to_cpu(c->sq_flags); 4904 uint64_t prp1 = le64_to_cpu(c->prp1); 4905 4906 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags); 4907 4908 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) { 4909 trace_pci_nvme_err_invalid_create_sq_cqid(cqid); 4910 return NVME_INVALID_CQID | NVME_DNR; 4911 } 4912 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) { 4913 trace_pci_nvme_err_invalid_create_sq_sqid(sqid); 4914 return NVME_INVALID_QID | NVME_DNR; 4915 } 4916 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) { 4917 trace_pci_nvme_err_invalid_create_sq_size(qsize); 4918 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 4919 } 4920 if (unlikely(prp1 & (n->page_size - 1))) { 4921 trace_pci_nvme_err_invalid_create_sq_addr(prp1); 4922 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 4923 } 4924 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) { 4925 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags)); 4926 return NVME_INVALID_FIELD | NVME_DNR; 4927 } 4928 sq = g_malloc0(sizeof(*sq)); 4929 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1); 4930 return NVME_SUCCESS; 4931 } 4932 4933 struct nvme_stats { 4934 uint64_t units_read; 4935 uint64_t units_written; 4936 uint64_t read_commands; 4937 uint64_t write_commands; 4938 }; 4939 4940 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) 4941 { 4942 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); 4943 4944 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ]; 4945 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE]; 4946 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; 4947 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; 4948 } 4949 4950 static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae, 4951 uint32_t buf_len, uint64_t off, 4952 NvmeRequest *req) 4953 { 4954 
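    /*
     * OCP Extended SMART Information log (reachable via the vendor specific
     * log page range when the 'ocp' device parameter is enabled). Only the
     * low 64 bits (element [0]) of the physical media units read/written
     * fields are populated, from the block layer statistics accumulated
     * over all attached namespaces below.
     */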
NvmeNamespace *ns = NULL; 4955 NvmeSmartLogExtended smart_l = { 0 }; 4956 struct nvme_stats stats = { 0 }; 4957 uint32_t trans_len; 4958 4959 if (off >= sizeof(smart_l)) { 4960 return NVME_INVALID_FIELD | NVME_DNR; 4961 } 4962 4963 /* accumulate all stats from all namespaces */ 4964 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4965 ns = nvme_ns(n, i); 4966 if (ns) { 4967 nvme_set_blk_stats(ns, &stats); 4968 } 4969 } 4970 4971 smart_l.physical_media_units_written[0] = cpu_to_le64(stats.units_written); 4972 smart_l.physical_media_units_read[0] = cpu_to_le64(stats.units_read); 4973 smart_l.log_page_version = 0x0005; 4974 4975 static const uint8_t guid[16] = { 4976 0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4, 4977 0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF 4978 }; 4979 memcpy(smart_l.log_page_guid, guid, sizeof(smart_l.log_page_guid)); 4980 4981 if (!rae) { 4982 nvme_clear_events(n, NVME_AER_TYPE_SMART); 4983 } 4984 4985 trans_len = MIN(sizeof(smart_l) - off, buf_len); 4986 return nvme_c2h(n, (uint8_t *) &smart_l + off, trans_len, req); 4987 } 4988 4989 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 4990 uint64_t off, NvmeRequest *req) 4991 { 4992 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 4993 struct nvme_stats stats = { 0 }; 4994 NvmeSmartLog smart = { 0 }; 4995 uint32_t trans_len; 4996 NvmeNamespace *ns; 4997 time_t current_ms; 4998 uint64_t u_read, u_written; 4999 5000 if (off >= sizeof(smart)) { 5001 return NVME_INVALID_FIELD | NVME_DNR; 5002 } 5003 5004 if (nsid != 0xffffffff) { 5005 ns = nvme_ns(n, nsid); 5006 if (!ns) { 5007 return NVME_INVALID_NSID | NVME_DNR; 5008 } 5009 nvme_set_blk_stats(ns, &stats); 5010 } else { 5011 int i; 5012 5013 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5014 ns = nvme_ns(n, i); 5015 if (!ns) { 5016 continue; 5017 } 5018 nvme_set_blk_stats(ns, &stats); 5019 } 5020 } 5021 5022 trans_len = MIN(sizeof(smart) - off, buf_len); 5023 smart.critical_warning = n->smart_critical_warning; 5024 5025 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000); 5026 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000); 5027 5028 smart.data_units_read[0] = cpu_to_le64(u_read); 5029 smart.data_units_written[0] = cpu_to_le64(u_written); 5030 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); 5031 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); 5032 5033 smart.temperature = cpu_to_le16(n->temperature); 5034 5035 if ((n->temperature >= n->features.temp_thresh_hi) || 5036 (n->temperature <= n->features.temp_thresh_low)) { 5037 smart.critical_warning |= NVME_SMART_TEMPERATURE; 5038 } 5039 5040 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 5041 smart.power_on_hours[0] = 5042 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60); 5043 5044 if (!rae) { 5045 nvme_clear_events(n, NVME_AER_TYPE_SMART); 5046 } 5047 5048 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); 5049 } 5050 5051 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 5052 uint64_t off, NvmeRequest *req) 5053 { 5054 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11); 5055 uint16_t endgrpid = (dw11 >> 16) & 0xffff; 5056 struct nvme_stats stats = {}; 5057 NvmeEndGrpLog info = {}; 5058 int i; 5059 5060 if (!n->subsys || endgrpid != 0x1) { 5061 return NVME_INVALID_FIELD | NVME_DNR; 5062 } 5063 5064 if (off >= sizeof(info)) { 5065 return NVME_INVALID_FIELD | NVME_DNR; 5066 } 5067 5068 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5069 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, 
i); 5070 if (!ns) { 5071 continue; 5072 } 5073 5074 nvme_set_blk_stats(ns, &stats); 5075 } 5076 5077 info.data_units_read[0] = 5078 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000)); 5079 info.data_units_written[0] = 5080 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000)); 5081 info.media_units_written[0] = 5082 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000)); 5083 5084 info.host_read_commands[0] = cpu_to_le64(stats.read_commands); 5085 info.host_write_commands[0] = cpu_to_le64(stats.write_commands); 5086 5087 buf_len = MIN(sizeof(info) - off, buf_len); 5088 5089 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req); 5090 } 5091 5092 5093 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, 5094 NvmeRequest *req) 5095 { 5096 uint32_t trans_len; 5097 NvmeFwSlotInfoLog fw_log = { 5098 .afi = 0x1, 5099 }; 5100 5101 if (off >= sizeof(fw_log)) { 5102 return NVME_INVALID_FIELD | NVME_DNR; 5103 } 5104 5105 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); 5106 trans_len = MIN(sizeof(fw_log) - off, buf_len); 5107 5108 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req); 5109 } 5110 5111 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 5112 uint64_t off, NvmeRequest *req) 5113 { 5114 uint32_t trans_len; 5115 NvmeErrorLog errlog; 5116 5117 if (off >= sizeof(errlog)) { 5118 return NVME_INVALID_FIELD | NVME_DNR; 5119 } 5120 5121 if (!rae) { 5122 nvme_clear_events(n, NVME_AER_TYPE_ERROR); 5123 } 5124 5125 memset(&errlog, 0x0, sizeof(errlog)); 5126 trans_len = MIN(sizeof(errlog) - off, buf_len); 5127 5128 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); 5129 } 5130 5131 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 5132 uint64_t off, NvmeRequest *req) 5133 { 5134 uint32_t nslist[1024] = {}; 5135 uint32_t trans_len; 5136 int i = 0; 5137 uint32_t nsid; 5138 5139 if (off >= sizeof(nslist)) { 5140 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist)); 5141 return NVME_INVALID_FIELD | NVME_DNR; 5142 } 5143 5144 trans_len = MIN(sizeof(nslist) - off, buf_len); 5145 5146 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != 5147 NVME_CHANGED_NSID_SIZE) { 5148 /* 5149 * If more than 1024 namespaces, the first entry in the log page should 5150 * be set to FFFFFFFFh and the others to 0 as spec. 5151 */ 5152 if (i == ARRAY_SIZE(nslist)) { 5153 memset(nslist, 0x0, sizeof(nslist)); 5154 nslist[0] = 0xffffffff; 5155 break; 5156 } 5157 5158 nslist[i++] = nsid; 5159 clear_bit(nsid, n->changed_nsids); 5160 } 5161 5162 /* 5163 * Remove all the remaining list entries in case returns directly due to 5164 * more than 1024 namespaces. 
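     * (i.e. when the loop above terminated early and set nslist[0] to
     * FFFFFFFFh).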
5165 */ 5166 if (nslist[0] == 0xffffffff) { 5167 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE); 5168 } 5169 5170 if (!rae) { 5171 nvme_clear_events(n, NVME_AER_TYPE_NOTICE); 5172 } 5173 5174 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req); 5175 } 5176 5177 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, 5178 uint64_t off, NvmeRequest *req) 5179 { 5180 NvmeEffectsLog log = {}; 5181 const uint32_t *iocs = NULL; 5182 uint32_t trans_len; 5183 5184 if (off >= sizeof(log)) { 5185 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log)); 5186 return NVME_INVALID_FIELD | NVME_DNR; 5187 } 5188 5189 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) { 5190 case NVME_CC_CSS_NVM: 5191 iocs = n->cse.iocs.nvm; 5192 break; 5193 5194 case NVME_CC_CSS_ALL: 5195 switch (csi) { 5196 case NVME_CSI_NVM: 5197 iocs = n->cse.iocs.nvm; 5198 break; 5199 case NVME_CSI_ZONED: 5200 iocs = n->cse.iocs.zoned; 5201 break; 5202 } 5203 5204 break; 5205 } 5206 5207 memcpy(log.acs, n->cse.acs, sizeof(log.acs)); 5208 5209 if (iocs) { 5210 memcpy(log.iocs, iocs, sizeof(log.iocs)); 5211 } 5212 5213 trans_len = MIN(sizeof(log) - off, buf_len); 5214 5215 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); 5216 } 5217 5218 static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae, 5219 uint32_t buf_len, uint64_t off, 5220 NvmeRequest *req, uint8_t lid) 5221 { 5222 switch (lid) { 5223 case NVME_OCP_EXTENDED_SMART_INFO: 5224 if (n->params.ocp) { 5225 return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req); 5226 } 5227 break; 5228 /* add a case for each additional vendor specific log id */ 5229 } 5230 5231 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); 5232 return NVME_INVALID_FIELD | NVME_DNR; 5233 } 5234 5235 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss) 5236 { 5237 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr) 5238 + vss; 5239 return ROUND_UP(entry_siz, 8); 5240 } 5241 5242 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len, 5243 uint64_t off, NvmeRequest *req) 5244 { 5245 uint32_t log_size, trans_len; 5246 g_autofree uint8_t *buf = NULL; 5247 NvmeFdpDescrHdr *hdr; 5248 NvmeRuhDescr *ruhd; 5249 NvmeEnduranceGroup *endgrp; 5250 NvmeFdpConfsHdr *log; 5251 size_t nruh, fdp_descr_size; 5252 int i; 5253 5254 if (endgrpid != 1 || !n->subsys) { 5255 return NVME_INVALID_FIELD | NVME_DNR; 5256 } 5257 5258 endgrp = &n->subsys->endgrp; 5259 5260 if (endgrp->fdp.enabled) { 5261 nruh = endgrp->fdp.nruh; 5262 } else { 5263 nruh = 1; 5264 } 5265 5266 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS); 5267 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size; 5268 5269 if (off >= log_size) { 5270 return NVME_INVALID_FIELD | NVME_DNR; 5271 } 5272 5273 trans_len = MIN(log_size - off, buf_len); 5274 5275 buf = g_malloc0(log_size); 5276 log = (NvmeFdpConfsHdr *)buf; 5277 hdr = (NvmeFdpDescrHdr *)(log + 1); 5278 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr)); 5279 5280 log->num_confs = cpu_to_le16(0); 5281 log->size = cpu_to_le32(log_size); 5282 5283 hdr->descr_size = cpu_to_le16(fdp_descr_size); 5284 if (endgrp->fdp.enabled) { 5285 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1); 5286 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif); 5287 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg); 5288 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh); 5289 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1); 5290 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES); 5291 hdr->runs = 
cpu_to_le64(endgrp->fdp.runs); 5292 5293 for (i = 0; i < nruh; i++) { 5294 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED; 5295 ruhd++; 5296 } 5297 } else { 5298 /* 1 bit for RUH in PIF -> 2 RUHs max. */ 5299 hdr->nrg = cpu_to_le16(1); 5300 hdr->nruh = cpu_to_le16(1); 5301 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1); 5302 hdr->nnss = cpu_to_le32(1); 5303 hdr->runs = cpu_to_le64(96 * MiB); 5304 5305 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED; 5306 } 5307 5308 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req); 5309 } 5310 5311 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid, 5312 uint32_t dw10, uint32_t dw12, 5313 uint32_t buf_len, uint64_t off, 5314 NvmeRequest *req) 5315 { 5316 NvmeRuHandle *ruh; 5317 NvmeRuhuLog *hdr; 5318 NvmeRuhuDescr *ruhud; 5319 NvmeEnduranceGroup *endgrp; 5320 g_autofree uint8_t *buf = NULL; 5321 uint32_t log_size, trans_len; 5322 uint16_t i; 5323 5324 if (endgrpid != 1 || !n->subsys) { 5325 return NVME_INVALID_FIELD | NVME_DNR; 5326 } 5327 5328 endgrp = &n->subsys->endgrp; 5329 5330 if (!endgrp->fdp.enabled) { 5331 return NVME_FDP_DISABLED | NVME_DNR; 5332 } 5333 5334 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr); 5335 5336 if (off >= log_size) { 5337 return NVME_INVALID_FIELD | NVME_DNR; 5338 } 5339 5340 trans_len = MIN(log_size - off, buf_len); 5341 5342 buf = g_malloc0(log_size); 5343 hdr = (NvmeRuhuLog *)buf; 5344 ruhud = (NvmeRuhuDescr *)(hdr + 1); 5345 5346 ruh = endgrp->fdp.ruhs; 5347 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh); 5348 5349 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) { 5350 ruhud->ruha = ruh->ruha; 5351 } 5352 5353 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req); 5354 } 5355 5356 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len, 5357 uint64_t off, NvmeRequest *req) 5358 { 5359 NvmeEnduranceGroup *endgrp; 5360 NvmeFdpStatsLog log = {}; 5361 uint32_t trans_len; 5362 5363 if (off >= sizeof(NvmeFdpStatsLog)) { 5364 return NVME_INVALID_FIELD | NVME_DNR; 5365 } 5366 5367 if (endgrpid != 1 || !n->subsys) { 5368 return NVME_INVALID_FIELD | NVME_DNR; 5369 } 5370 5371 if (!n->subsys->endgrp.fdp.enabled) { 5372 return NVME_FDP_DISABLED | NVME_DNR; 5373 } 5374 5375 endgrp = &n->subsys->endgrp; 5376 5377 trans_len = MIN(sizeof(log) - off, buf_len); 5378 5379 /* spec value is 128 bit, we only use 64 bit */ 5380 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw); 5381 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw); 5382 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe); 5383 5384 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req); 5385 } 5386 5387 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid, 5388 uint32_t buf_len, uint64_t off, 5389 NvmeRequest *req) 5390 { 5391 NvmeEnduranceGroup *endgrp; 5392 NvmeCmd *cmd = &req->cmd; 5393 bool host_events = (cmd->cdw10 >> 8) & 0x1; 5394 uint32_t log_size, trans_len; 5395 NvmeFdpEventBuffer *ebuf; 5396 g_autofree NvmeFdpEventsLog *elog = NULL; 5397 NvmeFdpEvent *event; 5398 5399 if (endgrpid != 1 || !n->subsys) { 5400 return NVME_INVALID_FIELD | NVME_DNR; 5401 } 5402 5403 endgrp = &n->subsys->endgrp; 5404 5405 if (!endgrp->fdp.enabled) { 5406 return NVME_FDP_DISABLED | NVME_DNR; 5407 } 5408 5409 if (host_events) { 5410 ebuf = &endgrp->fdp.host_events; 5411 } else { 5412 ebuf = &endgrp->fdp.ctrl_events; 5413 } 5414 5415 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent); 5416 5417 if (off >= log_size) { 5418 return NVME_INVALID_FIELD | NVME_DNR; 5419 } 5420 5421 trans_len = MIN(log_size - off, 
buf_len); 5422 elog = g_malloc0(log_size); 5423 elog->num_events = cpu_to_le32(ebuf->nelems); 5424 event = (NvmeFdpEvent *)(elog + 1); 5425 5426 if (ebuf->nelems && ebuf->start == ebuf->next) { 5427 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start); 5428 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */ 5429 memcpy(event, &ebuf->events[ebuf->start], 5430 sizeof(NvmeFdpEvent) * nelems); 5431 memcpy(event + nelems, ebuf->events, 5432 sizeof(NvmeFdpEvent) * ebuf->next); 5433 } else if (ebuf->start < ebuf->next) { 5434 memcpy(event, &ebuf->events[ebuf->start], 5435 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start)); 5436 } 5437 5438 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req); 5439 } 5440 5441 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) 5442 { 5443 NvmeCmd *cmd = &req->cmd; 5444 5445 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 5446 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 5447 uint32_t dw12 = le32_to_cpu(cmd->cdw12); 5448 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 5449 uint8_t lid = dw10 & 0xff; 5450 uint8_t lsp = (dw10 >> 8) & 0xf; 5451 uint8_t rae = (dw10 >> 15) & 0x1; 5452 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24; 5453 uint32_t numdl, numdu, lspi; 5454 uint64_t off, lpol, lpou; 5455 size_t len; 5456 uint16_t status; 5457 5458 numdl = (dw10 >> 16); 5459 numdu = (dw11 & 0xffff); 5460 lspi = (dw11 >> 16); 5461 lpol = dw12; 5462 lpou = dw13; 5463 5464 len = (((numdu << 16) | numdl) + 1) << 2; 5465 off = (lpou << 32ULL) | lpol; 5466 5467 if (off & 0x3) { 5468 return NVME_INVALID_FIELD | NVME_DNR; 5469 } 5470 5471 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off); 5472 5473 status = nvme_check_mdts(n, len); 5474 if (status) { 5475 return status; 5476 } 5477 5478 switch (lid) { 5479 case NVME_LOG_ERROR_INFO: 5480 return nvme_error_info(n, rae, len, off, req); 5481 case NVME_LOG_SMART_INFO: 5482 return nvme_smart_info(n, rae, len, off, req); 5483 case NVME_LOG_FW_SLOT_INFO: 5484 return nvme_fw_log_info(n, len, off, req); 5485 case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END: 5486 return nvme_vendor_specific_log(n, rae, len, off, req, lid); 5487 case NVME_LOG_CHANGED_NSLIST: 5488 return nvme_changed_nslist(n, rae, len, off, req); 5489 case NVME_LOG_CMD_EFFECTS: 5490 return nvme_cmd_effects(n, csi, len, off, req); 5491 case NVME_LOG_ENDGRP: 5492 return nvme_endgrp_info(n, rae, len, off, req); 5493 case NVME_LOG_FDP_CONFS: 5494 return nvme_fdp_confs(n, lspi, len, off, req); 5495 case NVME_LOG_FDP_RUH_USAGE: 5496 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req); 5497 case NVME_LOG_FDP_STATS: 5498 return nvme_fdp_stats(n, lspi, len, off, req); 5499 case NVME_LOG_FDP_EVENTS: 5500 return nvme_fdp_events(n, lspi, len, off, req); 5501 default: 5502 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); 5503 return NVME_INVALID_FIELD | NVME_DNR; 5504 } 5505 } 5506 5507 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) 5508 { 5509 PCIDevice *pci = PCI_DEVICE(n); 5510 uint16_t offset = (cq->cqid << 3) + (1 << 2); 5511 5512 n->cq[cq->cqid] = NULL; 5513 qemu_bh_delete(cq->bh); 5514 if (cq->ioeventfd_enabled) { 5515 memory_region_del_eventfd(&n->iomem, 5516 0x1000 + offset, 4, false, 0, &cq->notifier); 5517 event_notifier_set_handler(&cq->notifier, NULL); 5518 event_notifier_cleanup(&cq->notifier); 5519 } 5520 if (msix_enabled(pci) && cq->irq_enabled) { 5521 msix_vector_unuse(pci, cq->vector); 5522 } 5523 if (cq->cqid) { 5524 g_free(cq); 5525 } 5526 } 5527 5528 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) 5529 { 5530 
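    /*
     * Delete I/O Completion Queue. Any submission queues still using this
     * CQ must be deleted first; the cq->sq_list emptiness check below
     * enforces this.
     */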
NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 5531 NvmeCQueue *cq; 5532 uint16_t qid = le16_to_cpu(c->qid); 5533 5534 if (unlikely(!qid || nvme_check_cqid(n, qid))) { 5535 trace_pci_nvme_err_invalid_del_cq_cqid(qid); 5536 return NVME_INVALID_CQID | NVME_DNR; 5537 } 5538 5539 cq = n->cq[qid]; 5540 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) { 5541 trace_pci_nvme_err_invalid_del_cq_notempty(qid); 5542 return NVME_INVALID_QUEUE_DEL; 5543 } 5544 5545 if (cq->irq_enabled && cq->tail != cq->head) { 5546 n->cq_pending--; 5547 } 5548 5549 nvme_irq_deassert(n, cq); 5550 trace_pci_nvme_del_cq(qid); 5551 nvme_free_cq(cq, n); 5552 return NVME_SUCCESS; 5553 } 5554 5555 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, 5556 uint16_t cqid, uint16_t vector, uint16_t size, 5557 uint16_t irq_enabled) 5558 { 5559 PCIDevice *pci = PCI_DEVICE(n); 5560 5561 if (msix_enabled(pci) && irq_enabled) { 5562 msix_vector_use(pci, vector); 5563 } 5564 5565 cq->ctrl = n; 5566 cq->cqid = cqid; 5567 cq->size = size; 5568 cq->dma_addr = dma_addr; 5569 cq->phase = 1; 5570 cq->irq_enabled = irq_enabled; 5571 cq->vector = vector; 5572 cq->head = cq->tail = 0; 5573 QTAILQ_INIT(&cq->req_list); 5574 QTAILQ_INIT(&cq->sq_list); 5575 if (n->dbbuf_enabled) { 5576 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2); 5577 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2); 5578 5579 if (n->params.ioeventfd && cqid != 0) { 5580 if (!nvme_init_cq_ioeventfd(cq)) { 5581 cq->ioeventfd_enabled = true; 5582 } 5583 } 5584 } 5585 n->cq[cqid] = cq; 5586 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq, 5587 &DEVICE(cq->ctrl)->mem_reentrancy_guard); 5588 } 5589 5590 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) 5591 { 5592 NvmeCQueue *cq; 5593 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd; 5594 uint16_t cqid = le16_to_cpu(c->cqid); 5595 uint16_t vector = le16_to_cpu(c->irq_vector); 5596 uint16_t qsize = le16_to_cpu(c->qsize); 5597 uint16_t qflags = le16_to_cpu(c->cq_flags); 5598 uint64_t prp1 = le64_to_cpu(c->prp1); 5599 uint32_t cc = ldq_le_p(&n->bar.cc); 5600 uint8_t iocqes = NVME_CC_IOCQES(cc); 5601 uint8_t iosqes = NVME_CC_IOSQES(cc); 5602 5603 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, 5604 NVME_CQ_FLAGS_IEN(qflags) != 0); 5605 5606 if (iosqes != NVME_SQES || iocqes != NVME_CQES) { 5607 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes); 5608 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 5609 } 5610 5611 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) { 5612 trace_pci_nvme_err_invalid_create_cq_cqid(cqid); 5613 return NVME_INVALID_QID | NVME_DNR; 5614 } 5615 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) { 5616 trace_pci_nvme_err_invalid_create_cq_size(qsize); 5617 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 5618 } 5619 if (unlikely(prp1 & (n->page_size - 1))) { 5620 trace_pci_nvme_err_invalid_create_cq_addr(prp1); 5621 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 5622 } 5623 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) { 5624 trace_pci_nvme_err_invalid_create_cq_vector(vector); 5625 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 5626 } 5627 if (unlikely(vector >= n->conf_msix_qsize)) { 5628 trace_pci_nvme_err_invalid_create_cq_vector(vector); 5629 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 5630 } 5631 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) { 5632 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags)); 5633 return NVME_INVALID_FIELD | NVME_DNR; 5634 } 5635 5636 cq = g_malloc0(sizeof(*cq)); 5637 nvme_init_cq(cq, n, 
prp1, cqid, vector, qsize + 1, 5638 NVME_CQ_FLAGS_IEN(qflags)); 5639 5640 /* 5641 * It is only required to set qs_created when creating a completion queue; 5642 * creating a submission queue without a matching completion queue will 5643 * fail. 5644 */ 5645 n->qs_created = true; 5646 return NVME_SUCCESS; 5647 } 5648 5649 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) 5650 { 5651 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 5652 5653 return nvme_c2h(n, id, sizeof(id), req); 5654 } 5655 5656 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) 5657 { 5658 trace_pci_nvme_identify_ctrl(); 5659 5660 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req); 5661 } 5662 5663 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) 5664 { 5665 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5666 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 5667 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id; 5668 5669 trace_pci_nvme_identify_ctrl_csi(c->csi); 5670 5671 switch (c->csi) { 5672 case NVME_CSI_NVM: 5673 id_nvm->vsl = n->params.vsl; 5674 id_nvm->dmrl = NVME_ID_CTRL_NVM_DMRL_MAX; 5675 id_nvm->dmrsl = cpu_to_le32(n->dmrsl); 5676 id_nvm->dmsl = NVME_ID_CTRL_NVM_DMRL_MAX * n->dmrsl; 5677 break; 5678 5679 case NVME_CSI_ZONED: 5680 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl; 5681 break; 5682 5683 default: 5684 return NVME_INVALID_FIELD | NVME_DNR; 5685 } 5686 5687 return nvme_c2h(n, id, sizeof(id), req); 5688 } 5689 5690 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) 5691 { 5692 NvmeNamespace *ns; 5693 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5694 uint32_t nsid = le32_to_cpu(c->nsid); 5695 5696 trace_pci_nvme_identify_ns(nsid); 5697 5698 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 5699 return NVME_INVALID_NSID | NVME_DNR; 5700 } 5701 5702 ns = nvme_ns(n, nsid); 5703 if (unlikely(!ns)) { 5704 if (!active) { 5705 ns = nvme_subsys_ns(n->subsys, nsid); 5706 if (!ns) { 5707 return nvme_rpt_empty_id_struct(n, req); 5708 } 5709 } else { 5710 return nvme_rpt_empty_id_struct(n, req); 5711 } 5712 } 5713 5714 if (active || ns->csi == NVME_CSI_NVM) { 5715 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); 5716 } 5717 5718 return NVME_INVALID_IOCS | NVME_DNR; 5719 } 5720 5721 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req, 5722 bool attached) 5723 { 5724 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5725 uint32_t nsid = le32_to_cpu(c->nsid); 5726 uint16_t min_id = le16_to_cpu(c->ctrlid); 5727 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 5728 uint16_t *ids = &list[1]; 5729 NvmeNamespace *ns; 5730 NvmeCtrl *ctrl; 5731 int cntlid, nr_ids = 0; 5732 5733 trace_pci_nvme_identify_ctrl_list(c->cns, min_id); 5734 5735 if (!n->subsys) { 5736 return NVME_INVALID_FIELD | NVME_DNR; 5737 } 5738 5739 if (attached) { 5740 if (nsid == NVME_NSID_BROADCAST) { 5741 return NVME_INVALID_FIELD | NVME_DNR; 5742 } 5743 5744 ns = nvme_subsys_ns(n->subsys, nsid); 5745 if (!ns) { 5746 return NVME_INVALID_FIELD | NVME_DNR; 5747 } 5748 } 5749 5750 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { 5751 ctrl = nvme_subsys_ctrl(n->subsys, cntlid); 5752 if (!ctrl) { 5753 continue; 5754 } 5755 5756 if (attached && !nvme_ns(ctrl, nsid)) { 5757 continue; 5758 } 5759 5760 ids[nr_ids++] = cntlid; 5761 } 5762 5763 list[0] = nr_ids; 5764 5765 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req); 5766 } 5767 5768 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req) 5769 { 
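    /*
     * Identify Primary Controller Capabilities: reports the flexible queue
     * and interrupt resources advertised for SR-IOV (see the sriov_*
     * device parameters described in the header comment).
     */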
5770 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid)); 5771 5772 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap, 5773 sizeof(NvmePriCtrlCap), req); 5774 } 5775 5776 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req) 5777 { 5778 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5779 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid); 5780 uint16_t min_id = le16_to_cpu(c->ctrlid); 5781 uint8_t num_sec_ctrl = n->nr_sec_ctrls; 5782 NvmeSecCtrlList list = {0}; 5783 uint8_t i; 5784 5785 for (i = 0; i < num_sec_ctrl; i++) { 5786 if (n->sec_ctrl_list[i].scid >= min_id) { 5787 list.numcntl = MIN(num_sec_ctrl - i, 127); 5788 memcpy(&list.sec, n->sec_ctrl_list + i, 5789 list.numcntl * sizeof(NvmeSecCtrlEntry)); 5790 break; 5791 } 5792 } 5793 5794 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl); 5795 5796 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req); 5797 } 5798 5799 static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc) 5800 { 5801 NvmeNamespace *ns; 5802 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5803 uint32_t nsid = le32_to_cpu(c->nsid); 5804 5805 trace_pci_nvme_identify_ns_ind(nsid); 5806 5807 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 5808 return NVME_INVALID_NSID | NVME_DNR; 5809 } 5810 5811 ns = nvme_ns(n, nsid); 5812 if (unlikely(!ns)) { 5813 if (alloc) { 5814 ns = nvme_subsys_ns(n->subsys, nsid); 5815 if (!ns) { 5816 return nvme_rpt_empty_id_struct(n, req); 5817 } 5818 } else { 5819 return nvme_rpt_empty_id_struct(n, req); 5820 } 5821 } 5822 5823 return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req); 5824 } 5825 5826 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, 5827 bool active) 5828 { 5829 NvmeNamespace *ns; 5830 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5831 uint32_t nsid = le32_to_cpu(c->nsid); 5832 5833 trace_pci_nvme_identify_ns_csi(nsid, c->csi); 5834 5835 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 5836 return NVME_INVALID_NSID | NVME_DNR; 5837 } 5838 5839 ns = nvme_ns(n, nsid); 5840 if (unlikely(!ns)) { 5841 if (!active) { 5842 ns = nvme_subsys_ns(n->subsys, nsid); 5843 if (!ns) { 5844 return nvme_rpt_empty_id_struct(n, req); 5845 } 5846 } else { 5847 return nvme_rpt_empty_id_struct(n, req); 5848 } 5849 } 5850 5851 if (c->csi == NVME_CSI_NVM) { 5852 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm), 5853 req); 5854 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { 5855 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), 5856 req); 5857 } 5858 5859 return NVME_INVALID_FIELD | NVME_DNR; 5860 } 5861 5862 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, 5863 bool active) 5864 { 5865 NvmeNamespace *ns; 5866 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5867 uint32_t min_nsid = le32_to_cpu(c->nsid); 5868 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 5869 static const int data_len = sizeof(list); 5870 uint32_t *list_ptr = (uint32_t *)list; 5871 int i, j = 0; 5872 5873 trace_pci_nvme_identify_nslist(min_nsid); 5874 5875 /* 5876 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFFEh are invalid values 5877 * since the Active Namespace ID List should return namespaces with ids 5878 * *higher* than the NSID specified in the command. This is also specified 5879 * in the spec (NVM Express v1.3d, Section 5.15.4). 
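     * The check below therefore rejects FFFFFFFEh and FFFFFFFFh up front;
     * any smaller NSID yields the namespaces with identifiers strictly
     * greater than it, in ascending order.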
5880 */ 5881 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 5882 return NVME_INVALID_NSID | NVME_DNR; 5883 } 5884 5885 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5886 ns = nvme_ns(n, i); 5887 if (!ns) { 5888 if (!active) { 5889 ns = nvme_subsys_ns(n->subsys, i); 5890 if (!ns) { 5891 continue; 5892 } 5893 } else { 5894 continue; 5895 } 5896 } 5897 if (ns->params.nsid <= min_nsid) { 5898 continue; 5899 } 5900 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 5901 if (j == data_len / sizeof(uint32_t)) { 5902 break; 5903 } 5904 } 5905 5906 return nvme_c2h(n, list, data_len, req); 5907 } 5908 5909 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, 5910 bool active) 5911 { 5912 NvmeNamespace *ns; 5913 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5914 uint32_t min_nsid = le32_to_cpu(c->nsid); 5915 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 5916 static const int data_len = sizeof(list); 5917 uint32_t *list_ptr = (uint32_t *)list; 5918 int i, j = 0; 5919 5920 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); 5921 5922 /* 5923 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFFEh are invalid. 5924 */ 5925 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 5926 return NVME_INVALID_NSID | NVME_DNR; 5927 } 5928 5929 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) { 5930 return NVME_INVALID_FIELD | NVME_DNR; 5931 } 5932 5933 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5934 ns = nvme_ns(n, i); 5935 if (!ns) { 5936 if (!active) { 5937 ns = nvme_subsys_ns(n->subsys, i); 5938 if (!ns) { 5939 continue; 5940 } 5941 } else { 5942 continue; 5943 } 5944 } 5945 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { 5946 continue; 5947 } 5948 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 5949 if (j == data_len / sizeof(uint32_t)) { 5950 break; 5951 } 5952 } 5953 5954 return nvme_c2h(n, list, data_len, req); 5955 } 5956 5957 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req) 5958 { 5959 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 5960 uint16_t *nr_ids = &list[0]; 5961 uint16_t *ids = &list[1]; 5962 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff; 5963 5964 /* 5965 * The current nvme-subsys only supports Endurance Group #1. 
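     * A request with a starting identifier of 0h therefore returns a single
     * entry (endurance group 1); any non-zero starting identifier returns an
     * empty list.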
5966 */ 5967 if (!endgid) { 5968 *nr_ids = 1; 5969 ids[0] = 1; 5970 } else { 5971 *nr_ids = 0; 5972 } 5973 5974 return nvme_c2h(n, list, sizeof(list), req); 5975 } 5976 5977 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) 5978 { 5979 NvmeNamespace *ns; 5980 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 5981 uint32_t nsid = le32_to_cpu(c->nsid); 5982 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 5983 uint8_t *pos = list; 5984 struct { 5985 NvmeIdNsDescr hdr; 5986 uint8_t v[NVME_NIDL_UUID]; 5987 } QEMU_PACKED uuid = {}; 5988 struct { 5989 NvmeIdNsDescr hdr; 5990 uint8_t v[NVME_NIDL_NGUID]; 5991 } QEMU_PACKED nguid = {}; 5992 struct { 5993 NvmeIdNsDescr hdr; 5994 uint64_t v; 5995 } QEMU_PACKED eui64 = {}; 5996 struct { 5997 NvmeIdNsDescr hdr; 5998 uint8_t v; 5999 } QEMU_PACKED csi = {}; 6000 6001 trace_pci_nvme_identify_ns_descr_list(nsid); 6002 6003 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 6004 return NVME_INVALID_NSID | NVME_DNR; 6005 } 6006 6007 ns = nvme_ns(n, nsid); 6008 if (unlikely(!ns)) { 6009 return NVME_INVALID_FIELD | NVME_DNR; 6010 } 6011 6012 if (!qemu_uuid_is_null(&ns->params.uuid)) { 6013 uuid.hdr.nidt = NVME_NIDT_UUID; 6014 uuid.hdr.nidl = NVME_NIDL_UUID; 6015 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); 6016 memcpy(pos, &uuid, sizeof(uuid)); 6017 pos += sizeof(uuid); 6018 } 6019 6020 if (!nvme_nguid_is_null(&ns->params.nguid)) { 6021 nguid.hdr.nidt = NVME_NIDT_NGUID; 6022 nguid.hdr.nidl = NVME_NIDL_NGUID; 6023 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID); 6024 memcpy(pos, &nguid, sizeof(nguid)); 6025 pos += sizeof(nguid); 6026 } 6027 6028 if (ns->params.eui64) { 6029 eui64.hdr.nidt = NVME_NIDT_EUI64; 6030 eui64.hdr.nidl = NVME_NIDL_EUI64; 6031 eui64.v = cpu_to_be64(ns->params.eui64); 6032 memcpy(pos, &eui64, sizeof(eui64)); 6033 pos += sizeof(eui64); 6034 } 6035 6036 csi.hdr.nidt = NVME_NIDT_CSI; 6037 csi.hdr.nidl = NVME_NIDL_CSI; 6038 csi.v = ns->csi; 6039 memcpy(pos, &csi, sizeof(csi)); 6040 pos += sizeof(csi); 6041 6042 return nvme_c2h(n, list, sizeof(list), req); 6043 } 6044 6045 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) 6046 { 6047 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 6048 static const int data_len = sizeof(list); 6049 6050 trace_pci_nvme_identify_cmd_set(); 6051 6052 NVME_SET_CSI(*list, NVME_CSI_NVM); 6053 NVME_SET_CSI(*list, NVME_CSI_ZONED); 6054 6055 return nvme_c2h(n, list, data_len, req); 6056 } 6057 6058 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) 6059 { 6060 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 6061 6062 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid), 6063 c->csi); 6064 6065 switch (c->cns) { 6066 case NVME_ID_CNS_NS: 6067 return nvme_identify_ns(n, req, true); 6068 case NVME_ID_CNS_NS_PRESENT: 6069 return nvme_identify_ns(n, req, false); 6070 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: 6071 return nvme_identify_ctrl_list(n, req, true); 6072 case NVME_ID_CNS_CTRL_LIST: 6073 return nvme_identify_ctrl_list(n, req, false); 6074 case NVME_ID_CNS_PRIMARY_CTRL_CAP: 6075 return nvme_identify_pri_ctrl_cap(n, req); 6076 case NVME_ID_CNS_SECONDARY_CTRL_LIST: 6077 return nvme_identify_sec_ctrl_list(n, req); 6078 case NVME_ID_CNS_CS_NS: 6079 return nvme_identify_ns_csi(n, req, true); 6080 case NVME_ID_CNS_CS_IND_NS: 6081 return nvme_identify_ns_ind(n, req, false); 6082 case NVME_ID_CNS_CS_IND_NS_ALLOCATED: 6083 return nvme_identify_ns_ind(n, req, true); 6084 case NVME_ID_CNS_CS_NS_PRESENT: 6085 return nvme_identify_ns_csi(n, req, 
false); 6086 case NVME_ID_CNS_CTRL: 6087 return nvme_identify_ctrl(n, req); 6088 case NVME_ID_CNS_CS_CTRL: 6089 return nvme_identify_ctrl_csi(n, req); 6090 case NVME_ID_CNS_NS_ACTIVE_LIST: 6091 return nvme_identify_nslist(n, req, true); 6092 case NVME_ID_CNS_NS_PRESENT_LIST: 6093 return nvme_identify_nslist(n, req, false); 6094 case NVME_ID_CNS_CS_NS_ACTIVE_LIST: 6095 return nvme_identify_nslist_csi(n, req, true); 6096 case NVME_ID_CNS_ENDURANCE_GROUP_LIST: 6097 return nvme_endurance_group_list(n, req); 6098 case NVME_ID_CNS_CS_NS_PRESENT_LIST: 6099 return nvme_identify_nslist_csi(n, req, false); 6100 case NVME_ID_CNS_NS_DESCR_LIST: 6101 return nvme_identify_ns_descr_list(n, req); 6102 case NVME_ID_CNS_IO_COMMAND_SET: 6103 return nvme_identify_cmd_set(n, req); 6104 default: 6105 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); 6106 return NVME_INVALID_FIELD | NVME_DNR; 6107 } 6108 } 6109 6110 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req) 6111 { 6112 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff; 6113 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff; 6114 NvmeSQueue *sq = n->sq[sqid]; 6115 NvmeRequest *r, *next; 6116 int i; 6117 6118 req->cqe.result = 1; 6119 if (nvme_check_sqid(n, sqid)) { 6120 return NVME_INVALID_FIELD | NVME_DNR; 6121 } 6122 6123 if (sqid == 0) { 6124 for (i = 0; i < n->outstanding_aers; i++) { 6125 NvmeRequest *re = n->aer_reqs[i]; 6126 if (re->cqe.cid == cid) { 6127 memmove(n->aer_reqs + i, n->aer_reqs + i + 1, 6128 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *)); 6129 n->outstanding_aers--; 6130 re->status = NVME_CMD_ABORT_REQ; 6131 req->cqe.result = 0; 6132 nvme_enqueue_req_completion(&n->admin_cq, re); 6133 return NVME_SUCCESS; 6134 } 6135 } 6136 } 6137 6138 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) { 6139 if (r->cqe.cid == cid) { 6140 if (r->aiocb) { 6141 r->status = NVME_CMD_ABORT_REQ; 6142 blk_aio_cancel_async(r->aiocb); 6143 } 6144 break; 6145 } 6146 } 6147 6148 return NVME_SUCCESS; 6149 } 6150 6151 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts) 6152 { 6153 trace_pci_nvme_setfeat_timestamp(ts); 6154 6155 n->host_timestamp = le64_to_cpu(ts); 6156 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 6157 } 6158 6159 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) 6160 { 6161 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 6162 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms; 6163 6164 union nvme_timestamp { 6165 struct { 6166 uint64_t timestamp:48; 6167 uint64_t sync:1; 6168 uint64_t origin:3; 6169 uint64_t rsvd1:12; 6170 }; 6171 uint64_t all; 6172 }; 6173 6174 union nvme_timestamp ts; 6175 ts.all = 0; 6176 ts.timestamp = n->host_timestamp + elapsed_time; 6177 6178 /* If the host timestamp is non-zero, set the timestamp origin */ 6179 ts.origin = n->host_timestamp ? 
                                    0x01 : 0x00;

    trace_pci_nvme_getfeat_timestamp(ts.all);

    return cpu_to_le64(ts.all);
}

static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint64_t timestamp = nvme_get_timestamp(n);

    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
}

static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
                                uint32_t *result)
{
    *result = 0;

    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
    *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);

    return NVME_SUCCESS;
}

static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
                                            NvmeRequest *req, uint32_t *result)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
    uint16_t ph = cdw11 & 0xffff;
    uint8_t noet = (cdw11 >> 16) & 0xff;
    uint16_t ruhid, ret;
    uint32_t nentries = 0;
    uint8_t s_events_ndx = 0;
    size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
    g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
    NvmeRuHandle *ruh;
    NvmeFdpEventDescr *s_event;

    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    if (!nvme_ph_valid(ns, ph)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ruhid = ns->fdp.phs[ph];
    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];

    assert(ruh);

    if (unlikely(noet == 0)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
        uint8_t shift = nvme_fdp_evf_shifts[event_type];
        if (!shift && event_type) {
            /*
             * Only the first entry (event_type == 0) has a shift value of 0;
             * other entries with a zero shift are simply unpopulated.
             */
            continue;
        }

        nentries++;

        s_event = &s_events[s_events_ndx];
        s_event->evt = event_type;
        s_event->evta = (ruh->event_filter >> shift) & 0x1;

        /* break if all `noet` entries are filled */
        if ((++s_events_ndx) == noet) {
            break;
        }
    }

    ret = nvme_c2h(n, s_events, s_events_siz, req);
    if (ret) {
        return ret;
    }

    *result = nentries;
    return NVME_SUCCESS;
}

static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint32_t result = 0;
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
    uint16_t iv;
    NvmeNamespace *ns;
    int i;
    uint16_t endgrpid = 0, ret = NVME_SUCCESS;

    static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
        [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
    };

    trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command when
             * NSID is FFFFFFFFh.
             * Since the device does not support those features we can always
             * return Invalid Namespace or Format as we should do for all
             * other features.
             */
            return NVME_INVALID_NSID | NVME_DNR;
        }

        if (!nvme_ns(n, nsid)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    switch (sel) {
    case NVME_GETFEAT_SELECT_CURRENT:
        break;
    case NVME_GETFEAT_SELECT_SAVED:
        /* no features are saveable by the controller; fallthrough */
    case NVME_GETFEAT_SELECT_DEFAULT:
        goto defaults;
    case NVME_GETFEAT_SELECT_CAP:
        result = nvme_feature_cap[fid];
        goto out;
    }

    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        result = 0;

        /*
         * The controller only implements the Composite Temperature sensor, so
         * return 0 for all other sensors.
         */
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            goto out;
        }

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            result = n->features.temp_thresh_hi;
            goto out;
        case NVME_TEMP_THSEL_UNDER:
            result = n->features.temp_thresh_low;
            goto out;
        }

        return NVME_INVALID_FIELD | NVME_DNR;
    case NVME_ERROR_RECOVERY:
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = ns->features.err_rec;
        goto out;
    case NVME_VOLATILE_WRITE_CACHE:
        result = 0;
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            result = blk_enable_write_cache(ns->blkconf.blk);
            if (result) {
                break;
            }
        }
        trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
        goto out;
    case NVME_ASYNCHRONOUS_EVENT_CONF:
        result = n->features.async_config;
        goto out;
    case NVME_TIMESTAMP:
        return nvme_get_feature_timestamp(n, req);
    case NVME_HOST_BEHAVIOR_SUPPORT:
        return nvme_c2h(n, (uint8_t *)&n->features.hbs,
                        sizeof(n->features.hbs), req);
    case NVME_FDP_MODE:
        endgrpid = dw11 & 0xff;

        if (endgrpid != 0x1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ret = nvme_get_feature_fdp(n, endgrpid, &result);
        if (ret) {
            return ret;
        }
        goto out;
    case NVME_FDP_EVENTS:
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ret = nvme_get_feature_fdp_events(n, ns, req, &result);
        if (ret) {
            return ret;
        }
        goto out;
    default:
        break;
    }

defaults:
    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        result = 0;

        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            break;
        }

        if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
            result = NVME_TEMPERATURE_WARNING;
        }

        break;
    case NVME_NUMBER_OF_QUEUES:
        result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
        trace_pci_nvme_getfeat_numq(result);
        break;
    case NVME_INTERRUPT_VECTOR_CONF:
        iv = dw11 & 0xffff;
        if (iv >= n->conf_ioqpairs + 1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = iv;
        if (iv == n->admin_cq.vector) {
            result |= NVME_INTVC_NOCOALESCING;
        }
        break;
    case NVME_FDP_MODE:
        endgrpid = dw11 & 0xff;

        if (endgrpid != 0x1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ret = nvme_get_feature_fdp(n, endgrpid, &result);
        if (ret) {
            return ret;
        }
        break;

    case NVME_WRITE_ATOMICITY:
        result = n->dn;
        break;
    default:
        result = nvme_feature_default[fid];
        break;
    }

out:
    req->cqe.result = cpu_to_le32(result);
    return ret;
}

static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t ret;
    uint64_t timestamp;

    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
    if (ret) {
        return ret;
    }

    nvme_set_timestamp(n, timestamp);

    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
                                            NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
    uint16_t ph = cdw11 & 0xffff;
    uint8_t noet = (cdw11 >> 16) & 0xff;
    uint16_t ret, ruhid;
    uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
    uint8_t event_mask = 0;
    unsigned int i;
    g_autofree uint8_t *events = g_malloc0(noet);
    NvmeRuHandle *ruh = NULL;

    assert(ns);

    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    if (!nvme_ph_valid(ns, ph)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ruhid = ns->fdp.phs[ph];
    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];

    ret = nvme_h2c(n, events, noet, req);
    if (ret) {
        return ret;
    }

    for (i = 0; i < noet; i++) {
        event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
    }

    if (enable) {
        ruh->event_filter |= event_mask;
} else { 6521 ruh->event_filter = ruh->event_filter & ~event_mask; 6522 } 6523 6524 return NVME_SUCCESS; 6525 } 6526 6527 void nvme_atomic_configure_max_write_size(bool dn, uint16_t awun, 6528 uint16_t awupf, NvmeAtomic *atomic) 6529 { 6530 atomic->atomic_max_write_size = (dn ? awupf : awun) + 1; 6531 6532 if (atomic->atomic_max_write_size > 1) { 6533 atomic->atomic_writes = 1; 6534 } 6535 } 6536 6537 static uint16_t nvme_set_feature_write_atomicity(NvmeCtrl *n, NvmeRequest *req) 6538 { 6539 NvmeCmd *cmd = &req->cmd; 6540 6541 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 6542 6543 uint16_t awun = le16_to_cpu(n->id_ctrl.awun); 6544 uint16_t awupf = le16_to_cpu(n->id_ctrl.awupf); 6545 6546 n->dn = dw11 & 0x1; 6547 6548 nvme_atomic_configure_max_write_size(n->dn, awun, awupf, &n->atomic); 6549 6550 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { 6551 uint16_t nawun, nawupf, nabsn, nabspf; 6552 6553 NvmeNamespace *ns = nvme_ns(n, i); 6554 if (!ns) { 6555 continue; 6556 } 6557 6558 nawun = le16_to_cpu(ns->id_ns.nawun); 6559 nawupf = le16_to_cpu(ns->id_ns.nawupf); 6560 6561 nvme_atomic_configure_max_write_size(n->dn, nawun, nawupf, 6562 &ns->atomic); 6563 6564 nabsn = le16_to_cpu(ns->id_ns.nabsn); 6565 nabspf = le16_to_cpu(ns->id_ns.nabspf); 6566 6567 nvme_ns_atomic_configure_boundary(n->dn, nabsn, nabspf, 6568 &ns->atomic); 6569 } 6570 6571 return NVME_SUCCESS; 6572 } 6573 6574 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) 6575 { 6576 NvmeNamespace *ns = NULL; 6577 6578 NvmeCmd *cmd = &req->cmd; 6579 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 6580 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 6581 uint32_t nsid = le32_to_cpu(cmd->nsid); 6582 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 6583 uint8_t save = NVME_SETFEAT_SAVE(dw10); 6584 uint16_t status; 6585 int i; 6586 6587 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); 6588 6589 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) { 6590 return NVME_FID_NOT_SAVEABLE | NVME_DNR; 6591 } 6592 6593 if (!nvme_feature_support[fid]) { 6594 return NVME_INVALID_FIELD | NVME_DNR; 6595 } 6596 6597 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 6598 if (nsid != NVME_NSID_BROADCAST) { 6599 if (!nvme_nsid_valid(n, nsid)) { 6600 return NVME_INVALID_NSID | NVME_DNR; 6601 } 6602 6603 ns = nvme_ns(n, nsid); 6604 if (unlikely(!ns)) { 6605 return NVME_INVALID_FIELD | NVME_DNR; 6606 } 6607 } 6608 } else if (nsid && nsid != NVME_NSID_BROADCAST) { 6609 if (!nvme_nsid_valid(n, nsid)) { 6610 return NVME_INVALID_NSID | NVME_DNR; 6611 } 6612 6613 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR; 6614 } 6615 6616 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) { 6617 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 6618 } 6619 6620 switch (fid) { 6621 case NVME_TEMPERATURE_THRESHOLD: 6622 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 6623 break; 6624 } 6625 6626 switch (NVME_TEMP_THSEL(dw11)) { 6627 case NVME_TEMP_THSEL_OVER: 6628 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11); 6629 break; 6630 case NVME_TEMP_THSEL_UNDER: 6631 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11); 6632 break; 6633 default: 6634 return NVME_INVALID_FIELD | NVME_DNR; 6635 } 6636 6637 if ((n->temperature >= n->features.temp_thresh_hi) || 6638 (n->temperature <= n->features.temp_thresh_low)) { 6639 nvme_smart_event(n, NVME_SMART_TEMPERATURE); 6640 } 6641 6642 break; 6643 case NVME_ERROR_RECOVERY: 6644 if (nsid == NVME_NSID_BROADCAST) { 6645 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 6646 ns = nvme_ns(n, i); 6647 6648 if (!ns) { 6649 continue; 6650 } 6651 
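                /*
                 * Editorial note: the DULBE-based error recovery value below
                 * is only applied to namespaces that advertise DULBE support
                 * in NSFEAT; other namespaces are skipped without error.
                 */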
6652 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 6653 ns->features.err_rec = dw11; 6654 } 6655 } 6656 6657 break; 6658 } 6659 6660 assert(ns); 6661 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 6662 ns->features.err_rec = dw11; 6663 } 6664 break; 6665 case NVME_VOLATILE_WRITE_CACHE: 6666 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 6667 ns = nvme_ns(n, i); 6668 if (!ns) { 6669 continue; 6670 } 6671 6672 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) { 6673 blk_flush(ns->blkconf.blk); 6674 } 6675 6676 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1); 6677 } 6678 6679 break; 6680 6681 case NVME_NUMBER_OF_QUEUES: 6682 if (n->qs_created) { 6683 return NVME_CMD_SEQ_ERROR | NVME_DNR; 6684 } 6685 6686 /* 6687 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR 6688 * and NSQR. 6689 */ 6690 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) { 6691 return NVME_INVALID_FIELD | NVME_DNR; 6692 } 6693 6694 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1, 6695 ((dw11 >> 16) & 0xffff) + 1, 6696 n->conf_ioqpairs, 6697 n->conf_ioqpairs); 6698 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) | 6699 ((n->conf_ioqpairs - 1) << 16)); 6700 break; 6701 case NVME_ASYNCHRONOUS_EVENT_CONF: 6702 n->features.async_config = dw11; 6703 break; 6704 case NVME_TIMESTAMP: 6705 return nvme_set_feature_timestamp(n, req); 6706 case NVME_HOST_BEHAVIOR_SUPPORT: 6707 status = nvme_h2c(n, (uint8_t *)&n->features.hbs, 6708 sizeof(n->features.hbs), req); 6709 if (status) { 6710 return status; 6711 } 6712 6713 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 6714 ns = nvme_ns(n, i); 6715 6716 if (!ns) { 6717 continue; 6718 } 6719 6720 ns->id_ns.nlbaf = ns->nlbaf - 1; 6721 if (!n->features.hbs.lbafee) { 6722 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15); 6723 } 6724 } 6725 6726 return status; 6727 case NVME_COMMAND_SET_PROFILE: 6728 if (dw11 & 0x1ff) { 6729 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); 6730 return NVME_IOCS_COMBINATION_REJECTED | NVME_DNR; 6731 } 6732 break; 6733 case NVME_FDP_MODE: 6734 /* spec: abort with cmd seq err if there's one or more NS' in endgrp */ 6735 return NVME_CMD_SEQ_ERROR | NVME_DNR; 6736 case NVME_FDP_EVENTS: 6737 return nvme_set_feature_fdp_events(n, ns, req); 6738 case NVME_WRITE_ATOMICITY: 6739 return nvme_set_feature_write_atomicity(n, req); 6740 default: 6741 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 6742 } 6743 return NVME_SUCCESS; 6744 } 6745 6746 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) 6747 { 6748 trace_pci_nvme_aer(nvme_cid(req)); 6749 6750 if (n->outstanding_aers > n->params.aerl) { 6751 trace_pci_nvme_aer_aerl_exceeded(); 6752 return NVME_AER_LIMIT_EXCEEDED; 6753 } 6754 6755 n->aer_reqs[n->outstanding_aers] = req; 6756 n->outstanding_aers++; 6757 6758 if (!QTAILQ_EMPTY(&n->aer_queue)) { 6759 nvme_process_aers(n); 6760 } 6761 6762 return NVME_NO_COMPLETE; 6763 } 6764 6765 static void nvme_update_dsm_limits(NvmeCtrl *n, NvmeNamespace *ns) 6766 { 6767 if (ns) { 6768 n->dmrsl = 6769 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 6770 6771 return; 6772 } 6773 6774 for (uint32_t nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { 6775 ns = nvme_ns(n, nsid); 6776 if (!ns) { 6777 continue; 6778 } 6779 6780 n->dmrsl = 6781 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 6782 } 6783 } 6784 6785 static bool nvme_csi_supported(NvmeCtrl *n, uint8_t csi) 6786 { 6787 uint32_t cc; 6788 6789 switch (csi) { 6790 case NVME_CSI_NVM: 6791 return true; 6792 6793 case NVME_CSI_ZONED: 6794 cc = 
ldl_le_p(&n->bar.cc); 6795 6796 return NVME_CC_CSS(cc) == NVME_CC_CSS_ALL; 6797 } 6798 6799 g_assert_not_reached(); 6800 } 6801 6802 static void nvme_detach_ns(NvmeCtrl *n, NvmeNamespace *ns) 6803 { 6804 assert(ns->attached > 0); 6805 6806 n->namespaces[ns->params.nsid] = NULL; 6807 ns->attached--; 6808 } 6809 6810 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) 6811 { 6812 NvmeNamespace *ns; 6813 NvmeCtrl *ctrl; 6814 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 6815 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 6816 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 6817 uint8_t sel = dw10 & 0xf; 6818 uint16_t *nr_ids = &list[0]; 6819 uint16_t *ids = &list[1]; 6820 uint16_t ret; 6821 int i; 6822 6823 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf); 6824 6825 if (!nvme_nsid_valid(n, nsid)) { 6826 return NVME_INVALID_NSID | NVME_DNR; 6827 } 6828 6829 ns = nvme_subsys_ns(n->subsys, nsid); 6830 if (!ns) { 6831 return NVME_INVALID_FIELD | NVME_DNR; 6832 } 6833 6834 ret = nvme_h2c(n, (uint8_t *)list, 4096, req); 6835 if (ret) { 6836 return ret; 6837 } 6838 6839 if (!*nr_ids) { 6840 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 6841 } 6842 6843 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1); 6844 for (i = 0; i < *nr_ids; i++) { 6845 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]); 6846 if (!ctrl) { 6847 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 6848 } 6849 6850 switch (sel) { 6851 case NVME_NS_ATTACHMENT_ATTACH: 6852 if (nvme_ns(ctrl, nsid)) { 6853 return NVME_NS_ALREADY_ATTACHED | NVME_DNR; 6854 } 6855 6856 if (ns->attached && !ns->params.shared) { 6857 return NVME_NS_PRIVATE | NVME_DNR; 6858 } 6859 6860 if (!nvme_csi_supported(ctrl, ns->csi)) { 6861 return NVME_IOCS_NOT_SUPPORTED | NVME_DNR; 6862 } 6863 6864 nvme_attach_ns(ctrl, ns); 6865 nvme_update_dsm_limits(ctrl, ns); 6866 6867 break; 6868 6869 case NVME_NS_ATTACHMENT_DETACH: 6870 if (!nvme_ns(ctrl, nsid)) { 6871 return NVME_NS_NOT_ATTACHED | NVME_DNR; 6872 } 6873 6874 nvme_detach_ns(ctrl, ns); 6875 nvme_update_dsm_limits(ctrl, NULL); 6876 6877 break; 6878 6879 default: 6880 return NVME_INVALID_FIELD | NVME_DNR; 6881 } 6882 6883 /* 6884 * Add namespace id to the changed namespace id list for event clearing 6885 * via Get Log Page command. 
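         * The bit is only newly set (and the event only queued) when the NSID
         * is not already pending in changed_nsids, so at most one Namespace
         * Attribute Changed notice is raised until the host reads and clears
         * the Changed Namespace List log page.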
6886 */ 6887 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { 6888 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, 6889 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, 6890 NVME_LOG_CHANGED_NSLIST); 6891 } 6892 } 6893 6894 return NVME_SUCCESS; 6895 } 6896 6897 typedef struct NvmeFormatAIOCB { 6898 BlockAIOCB common; 6899 BlockAIOCB *aiocb; 6900 NvmeRequest *req; 6901 int ret; 6902 6903 NvmeNamespace *ns; 6904 uint32_t nsid; 6905 bool broadcast; 6906 int64_t offset; 6907 6908 uint8_t lbaf; 6909 uint8_t mset; 6910 uint8_t pi; 6911 uint8_t pil; 6912 } NvmeFormatAIOCB; 6913 6914 static void nvme_format_cancel(BlockAIOCB *aiocb) 6915 { 6916 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common); 6917 6918 iocb->ret = -ECANCELED; 6919 6920 if (iocb->aiocb) { 6921 blk_aio_cancel_async(iocb->aiocb); 6922 iocb->aiocb = NULL; 6923 } 6924 } 6925 6926 static const AIOCBInfo nvme_format_aiocb_info = { 6927 .aiocb_size = sizeof(NvmeFormatAIOCB), 6928 .cancel_async = nvme_format_cancel, 6929 }; 6930 6931 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset, 6932 uint8_t pi, uint8_t pil) 6933 { 6934 uint8_t lbafl = lbaf & 0xf; 6935 uint8_t lbafu = lbaf >> 4; 6936 6937 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil); 6938 6939 ns->id_ns.dps = (pil << 3) | pi; 6940 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl; 6941 6942 nvme_ns_init_format(ns); 6943 } 6944 6945 static void nvme_do_format(NvmeFormatAIOCB *iocb); 6946 6947 static void nvme_format_ns_cb(void *opaque, int ret) 6948 { 6949 NvmeFormatAIOCB *iocb = opaque; 6950 NvmeNamespace *ns = iocb->ns; 6951 int bytes; 6952 6953 if (iocb->ret < 0) { 6954 goto done; 6955 } else if (ret < 0) { 6956 iocb->ret = ret; 6957 goto done; 6958 } 6959 6960 assert(ns); 6961 6962 if (iocb->offset < ns->size) { 6963 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset); 6964 6965 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset, 6966 bytes, BDRV_REQ_MAY_UNMAP, 6967 nvme_format_ns_cb, iocb); 6968 6969 iocb->offset += bytes; 6970 return; 6971 } 6972 6973 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil); 6974 ns->status = 0x0; 6975 iocb->ns = NULL; 6976 iocb->offset = 0; 6977 6978 done: 6979 nvme_do_format(iocb); 6980 } 6981 6982 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi) 6983 { 6984 if (ns->params.zoned) { 6985 return NVME_INVALID_FORMAT | NVME_DNR; 6986 } 6987 6988 if (lbaf > ns->id_ns.nlbaf) { 6989 return NVME_INVALID_FORMAT | NVME_DNR; 6990 } 6991 6992 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) { 6993 return NVME_INVALID_FORMAT | NVME_DNR; 6994 } 6995 6996 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) { 6997 return NVME_INVALID_FIELD | NVME_DNR; 6998 } 6999 7000 return NVME_SUCCESS; 7001 } 7002 7003 static void nvme_do_format(NvmeFormatAIOCB *iocb) 7004 { 7005 NvmeRequest *req = iocb->req; 7006 NvmeCtrl *n = nvme_ctrl(req); 7007 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7008 uint8_t lbaf = dw10 & 0xf; 7009 uint8_t pi = (dw10 >> 5) & 0x7; 7010 uint16_t status; 7011 int i; 7012 7013 if (iocb->ret < 0) { 7014 goto done; 7015 } 7016 7017 if (iocb->broadcast) { 7018 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) { 7019 iocb->ns = nvme_ns(n, i); 7020 if (iocb->ns) { 7021 iocb->nsid = i; 7022 break; 7023 } 7024 } 7025 } 7026 7027 if (!iocb->ns) { 7028 goto done; 7029 } 7030 7031 status = nvme_format_check(iocb->ns, lbaf, pi); 7032 if (status) { 7033 req->status = status; 7034 goto done; 7035 } 7036 7037 iocb->ns->status = 
NVME_FORMAT_IN_PROGRESS; 7038 nvme_format_ns_cb(iocb, 0); 7039 return; 7040 7041 done: 7042 iocb->common.cb(iocb->common.opaque, iocb->ret); 7043 qemu_aio_unref(iocb); 7044 } 7045 7046 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req) 7047 { 7048 NvmeFormatAIOCB *iocb; 7049 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 7050 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7051 uint8_t lbaf = dw10 & 0xf; 7052 uint8_t mset = (dw10 >> 4) & 0x1; 7053 uint8_t pi = (dw10 >> 5) & 0x7; 7054 uint8_t pil = (dw10 >> 8) & 0x1; 7055 uint8_t lbafu = (dw10 >> 12) & 0x3; 7056 uint16_t status; 7057 7058 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req); 7059 7060 iocb->req = req; 7061 iocb->ret = 0; 7062 iocb->ns = NULL; 7063 iocb->nsid = 0; 7064 iocb->lbaf = lbaf; 7065 iocb->mset = mset; 7066 iocb->pi = pi; 7067 iocb->pil = pil; 7068 iocb->broadcast = (nsid == NVME_NSID_BROADCAST); 7069 iocb->offset = 0; 7070 7071 if (n->features.hbs.lbafee) { 7072 iocb->lbaf |= lbafu << 4; 7073 } 7074 7075 if (!iocb->broadcast) { 7076 if (!nvme_nsid_valid(n, nsid)) { 7077 status = NVME_INVALID_NSID | NVME_DNR; 7078 goto out; 7079 } 7080 7081 iocb->ns = nvme_ns(n, nsid); 7082 if (!iocb->ns) { 7083 status = NVME_INVALID_FIELD | NVME_DNR; 7084 goto out; 7085 } 7086 } 7087 7088 req->aiocb = &iocb->common; 7089 nvme_do_format(iocb); 7090 7091 return NVME_NO_COMPLETE; 7092 7093 out: 7094 qemu_aio_unref(iocb); 7095 7096 return status; 7097 } 7098 7099 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total, 7100 int *num_prim, int *num_sec) 7101 { 7102 *num_total = le32_to_cpu(rt ? 7103 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt); 7104 *num_prim = le16_to_cpu(rt ? 7105 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap); 7106 *num_sec = le16_to_cpu(rt ? 
n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa); 7107 } 7108 7109 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req, 7110 uint16_t cntlid, uint8_t rt, 7111 int nr) 7112 { 7113 int num_total, num_prim, num_sec; 7114 7115 if (cntlid != n->cntlid) { 7116 return NVME_INVALID_CTRL_ID | NVME_DNR; 7117 } 7118 7119 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec); 7120 7121 if (nr > num_total) { 7122 return NVME_INVALID_NUM_RESOURCES | NVME_DNR; 7123 } 7124 7125 if (nr > num_total - num_sec) { 7126 return NVME_INVALID_RESOURCE_ID | NVME_DNR; 7127 } 7128 7129 if (rt) { 7130 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr); 7131 } else { 7132 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr); 7133 } 7134 7135 req->cqe.result = cpu_to_le32(nr); 7136 return req->status; 7137 } 7138 7139 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl, 7140 uint8_t rt, int nr) 7141 { 7142 int prev_nr, prev_total; 7143 7144 if (rt) { 7145 prev_nr = le16_to_cpu(sctrl->nvi); 7146 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa); 7147 sctrl->nvi = cpu_to_le16(nr); 7148 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr); 7149 } else { 7150 prev_nr = le16_to_cpu(sctrl->nvq); 7151 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa); 7152 sctrl->nvq = cpu_to_le16(nr); 7153 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr); 7154 } 7155 } 7156 7157 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req, 7158 uint16_t cntlid, uint8_t rt, int nr) 7159 { 7160 int num_total, num_prim, num_sec, num_free, diff, limit; 7161 NvmeSecCtrlEntry *sctrl; 7162 7163 sctrl = nvme_sctrl_for_cntlid(n, cntlid); 7164 if (!sctrl) { 7165 return NVME_INVALID_CTRL_ID | NVME_DNR; 7166 } 7167 7168 if (sctrl->scs) { 7169 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR; 7170 } 7171 7172 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm); 7173 if (nr > limit) { 7174 return NVME_INVALID_NUM_RESOURCES | NVME_DNR; 7175 } 7176 7177 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec); 7178 num_free = num_total - num_prim - num_sec; 7179 diff = nr - le16_to_cpu(rt ? 
sctrl->nvi : sctrl->nvq); 7180 7181 if (diff > num_free) { 7182 return NVME_INVALID_RESOURCE_ID | NVME_DNR; 7183 } 7184 7185 nvme_update_virt_res(n, sctrl, rt, nr); 7186 req->cqe.result = cpu_to_le32(nr); 7187 7188 return req->status; 7189 } 7190 7191 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online) 7192 { 7193 PCIDevice *pci = PCI_DEVICE(n); 7194 NvmeCtrl *sn = NULL; 7195 NvmeSecCtrlEntry *sctrl; 7196 int vf_index; 7197 7198 sctrl = nvme_sctrl_for_cntlid(n, cntlid); 7199 if (!sctrl) { 7200 return NVME_INVALID_CTRL_ID | NVME_DNR; 7201 } 7202 7203 if (!pci_is_vf(pci)) { 7204 vf_index = le16_to_cpu(sctrl->vfn) - 1; 7205 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index)); 7206 } 7207 7208 if (online) { 7209 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) { 7210 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR; 7211 } 7212 7213 if (!sctrl->scs) { 7214 sctrl->scs = 0x1; 7215 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION); 7216 } 7217 } else { 7218 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0); 7219 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0); 7220 7221 if (sctrl->scs) { 7222 sctrl->scs = 0x0; 7223 if (sn) { 7224 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION); 7225 } 7226 } 7227 } 7228 7229 return NVME_SUCCESS; 7230 } 7231 7232 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req) 7233 { 7234 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7235 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11); 7236 uint8_t act = dw10 & 0xf; 7237 uint8_t rt = (dw10 >> 8) & 0x7; 7238 uint16_t cntlid = (dw10 >> 16) & 0xffff; 7239 int nr = dw11 & 0xffff; 7240 7241 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr); 7242 7243 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) { 7244 return NVME_INVALID_RESOURCE_ID | NVME_DNR; 7245 } 7246 7247 switch (act) { 7248 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN: 7249 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr); 7250 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC: 7251 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr); 7252 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE: 7253 return nvme_virt_set_state(n, cntlid, true); 7254 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE: 7255 return nvme_virt_set_state(n, cntlid, false); 7256 default: 7257 return NVME_INVALID_FIELD | NVME_DNR; 7258 } 7259 } 7260 7261 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) 7262 { 7263 PCIDevice *pci = PCI_DEVICE(n); 7264 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); 7265 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); 7266 int i; 7267 7268 /* Address should be page aligned */ 7269 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) { 7270 return NVME_INVALID_FIELD | NVME_DNR; 7271 } 7272 7273 /* Save shadow buffer base addr for use during queue creation */ 7274 n->dbbuf_dbs = dbs_addr; 7275 n->dbbuf_eis = eis_addr; 7276 n->dbbuf_enabled = true; 7277 7278 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 7279 NvmeSQueue *sq = n->sq[i]; 7280 NvmeCQueue *cq = n->cq[i]; 7281 7282 if (sq) { 7283 /* 7284 * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3) 7285 * nvme_process_db() uses this hard-coded way to calculate 7286 * doorbell offsets. Be consistent with that here. 
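             * For reference (editorial note): with the fixed 4-byte stride of
             * CAP.DSTRD = 0, the SQ y tail doorbell lives at BAR0 offset
             * 1000h + (2y * 4) and the CQ y head doorbell at
             * 1000h + ((2y + 1) * 4), which is why the shadow and eventidx
             * slots are computed as (i << 3) and (i << 3) + (1 << 2) below,
             * e.g. 18h and 1Ch for i == 3.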
7287 */ 7288 sq->db_addr = dbs_addr + (i << 3); 7289 sq->ei_addr = eis_addr + (i << 3); 7290 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED); 7291 7292 if (n->params.ioeventfd && sq->sqid != 0) { 7293 if (!nvme_init_sq_ioeventfd(sq)) { 7294 sq->ioeventfd_enabled = true; 7295 } 7296 } 7297 } 7298 7299 if (cq) { 7300 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */ 7301 cq->db_addr = dbs_addr + (i << 3) + (1 << 2); 7302 cq->ei_addr = eis_addr + (i << 3) + (1 << 2); 7303 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); 7304 7305 if (n->params.ioeventfd && cq->cqid != 0) { 7306 if (!nvme_init_cq_ioeventfd(cq)) { 7307 cq->ioeventfd_enabled = true; 7308 } 7309 } 7310 } 7311 } 7312 7313 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr); 7314 7315 return NVME_SUCCESS; 7316 } 7317 7318 static uint16_t nvme_sec_prot_spdm_send(NvmeCtrl *n, NvmeRequest *req) 7319 { 7320 StorageSpdmTransportHeader hdr = {0}; 7321 g_autofree uint8_t *sec_buf = NULL; 7322 uint32_t transfer_len = le32_to_cpu(req->cmd.cdw11); 7323 uint32_t transport_transfer_len = transfer_len; 7324 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7325 uint32_t recvd; 7326 uint16_t nvme_cmd_status, ret; 7327 uint8_t secp = extract32(dw10, 24, 8); 7328 uint16_t spsp = extract32(dw10, 8, 16); 7329 bool spdm_res; 7330 7331 if (transport_transfer_len > UINT32_MAX - sizeof(hdr)) { 7332 return NVME_INVALID_FIELD | NVME_DNR; 7333 } 7334 7335 transport_transfer_len += sizeof(hdr); 7336 if (transport_transfer_len > SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE) { 7337 return NVME_INVALID_FIELD | NVME_DNR; 7338 } 7339 7340 ret = nvme_check_mdts(n, transport_transfer_len); 7341 if (ret != NVME_SUCCESS) { 7342 return ret; 7343 } 7344 7345 /* Generate the NVMe transport header */ 7346 hdr.security_protocol = secp; 7347 hdr.security_protocol_specific = cpu_to_le16(spsp); 7348 hdr.length = cpu_to_le32(transfer_len); 7349 7350 sec_buf = g_try_malloc0(transport_transfer_len); 7351 if (!sec_buf) { 7352 return NVME_INTERNAL_DEV_ERROR; 7353 } 7354 7355 /* Attach the transport header */ 7356 memcpy(sec_buf, &hdr, sizeof(hdr)); 7357 ret = nvme_h2c(n, sec_buf + sizeof(hdr), transfer_len, req); 7358 if (ret) { 7359 return ret; 7360 } 7361 7362 spdm_res = spdm_socket_send(n->spdm_socket, SPDM_SOCKET_STORAGE_CMD_IF_SEND, 7363 SPDM_SOCKET_TRANSPORT_TYPE_NVME, sec_buf, 7364 transport_transfer_len); 7365 if (!spdm_res) { 7366 return NVME_DATA_TRAS_ERROR | NVME_DNR; 7367 } 7368 7369 /* The responder shall ack with message status */ 7370 recvd = spdm_socket_receive(n->spdm_socket, SPDM_SOCKET_TRANSPORT_TYPE_NVME, 7371 &nvme_cmd_status, 7372 SPDM_SOCKET_MAX_MSG_STATUS_LEN); 7373 7374 nvme_cmd_status = be16_to_cpu(nvme_cmd_status); 7375 7376 if (recvd < SPDM_SOCKET_MAX_MSG_STATUS_LEN) { 7377 return NVME_DATA_TRAS_ERROR | NVME_DNR; 7378 } 7379 7380 return nvme_cmd_status; 7381 } 7382 7383 /* From host to controller */ 7384 static uint16_t nvme_security_send(NvmeCtrl *n, NvmeRequest *req) 7385 { 7386 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7387 uint8_t secp = extract32(dw10, 24, 8); 7388 7389 switch (secp) { 7390 case NVME_SEC_PROT_DMTF_SPDM: 7391 if (n->spdm_socket < 0) { 7392 return NVME_INVALID_FIELD | NVME_DNR; 7393 } 7394 return nvme_sec_prot_spdm_send(n, req); 7395 default: 7396 /* Unsupported Security Protocol Type */ 7397 return NVME_INVALID_FIELD | NVME_DNR; 7398 } 7399 7400 return NVME_INVALID_FIELD | NVME_DNR; 7401 } 7402 7403 static uint16_t nvme_sec_prot_spdm_receive(NvmeCtrl *n, NvmeRequest *req) 7404 { 7405 
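    /*
     * Rough flow (editorial summary of the code below): forward the IF_RECV
     * transport header to the external SPDM server, wait for its status
     * acknowledgement, then receive up to alloc_len bytes of response data
     * and copy them back to the host with nvme_c2h().
     */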
StorageSpdmTransportHeader hdr; 7406 g_autofree uint8_t *rsp_spdm_buf = NULL; 7407 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7408 uint32_t alloc_len = le32_to_cpu(req->cmd.cdw11); 7409 uint32_t recvd, spdm_res; 7410 uint16_t nvme_cmd_status, ret; 7411 uint8_t secp = extract32(dw10, 24, 8); 7412 uint8_t spsp = extract32(dw10, 8, 16); 7413 if (!alloc_len) { 7414 return NVME_INVALID_FIELD | NVME_DNR; 7415 } 7416 7417 /* Generate the NVMe transport header */ 7418 hdr = (StorageSpdmTransportHeader) { 7419 .security_protocol = secp, 7420 .security_protocol_specific = cpu_to_le16(spsp), 7421 .length = cpu_to_le32(alloc_len), 7422 }; 7423 7424 /* Forward if_recv to the SPDM Server with SPSP0 */ 7425 spdm_res = spdm_socket_send(n->spdm_socket, SPDM_SOCKET_STORAGE_CMD_IF_RECV, 7426 SPDM_SOCKET_TRANSPORT_TYPE_NVME, 7427 &hdr, sizeof(hdr)); 7428 if (!spdm_res) { 7429 return NVME_DATA_TRAS_ERROR | NVME_DNR; 7430 } 7431 7432 /* The responder shall ack with message status */ 7433 recvd = spdm_socket_receive(n->spdm_socket, SPDM_SOCKET_TRANSPORT_TYPE_NVME, 7434 &nvme_cmd_status, 7435 SPDM_SOCKET_MAX_MSG_STATUS_LEN); 7436 if (recvd < SPDM_SOCKET_MAX_MSG_STATUS_LEN) { 7437 return NVME_DATA_TRAS_ERROR | NVME_DNR; 7438 } 7439 7440 nvme_cmd_status = be16_to_cpu(nvme_cmd_status); 7441 /* An error here implies the prior if_recv from requester was spurious */ 7442 if (nvme_cmd_status != NVME_SUCCESS) { 7443 return nvme_cmd_status; 7444 } 7445 7446 /* Clear to start receiving data from the server */ 7447 rsp_spdm_buf = g_try_malloc0(alloc_len); 7448 if (!rsp_spdm_buf) { 7449 return NVME_INTERNAL_DEV_ERROR; 7450 } 7451 7452 recvd = spdm_socket_receive(n->spdm_socket, 7453 SPDM_SOCKET_TRANSPORT_TYPE_NVME, 7454 rsp_spdm_buf, alloc_len); 7455 if (!recvd) { 7456 return NVME_DATA_TRAS_ERROR | NVME_DNR; 7457 } 7458 7459 ret = nvme_c2h(n, rsp_spdm_buf, MIN(recvd, alloc_len), req); 7460 if (ret) { 7461 return ret; 7462 } 7463 7464 return NVME_SUCCESS; 7465 } 7466 7467 static uint16_t nvme_get_sec_prot_info(NvmeCtrl *n, NvmeRequest *req) 7468 { 7469 uint32_t alloc_len = le32_to_cpu(req->cmd.cdw11); 7470 uint8_t resp[10] = { 7471 /* Support Security Protol List Length */ 7472 [6] = 0, /* MSB */ 7473 [7] = 2, /* LSB */ 7474 /* Support Security Protocol List */ 7475 [8] = SFSC_SECURITY_PROT_INFO, 7476 [9] = 0, 7477 }; 7478 7479 if (n->spdm_socket >= 0) { 7480 resp[9] = NVME_SEC_PROT_DMTF_SPDM; 7481 } 7482 7483 if (alloc_len < 10) { 7484 return NVME_INVALID_FIELD | NVME_DNR; 7485 } 7486 7487 return nvme_c2h(n, resp, sizeof(resp), req); 7488 } 7489 7490 /* From controller to host */ 7491 static uint16_t nvme_security_receive(NvmeCtrl *n, NvmeRequest *req) 7492 { 7493 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7494 uint16_t spsp = extract32(dw10, 8, 16); 7495 uint8_t secp = extract32(dw10, 24, 8); 7496 7497 switch (secp) { 7498 case SFSC_SECURITY_PROT_INFO: 7499 switch (spsp) { 7500 case 0: 7501 /* Supported security protocol list */ 7502 return nvme_get_sec_prot_info(n, req); 7503 case 1: 7504 /* Certificate data */ 7505 /* fallthrough */ 7506 default: 7507 return NVME_INVALID_FIELD | NVME_DNR; 7508 } 7509 case NVME_SEC_PROT_DMTF_SPDM: 7510 if (n->spdm_socket < 0) { 7511 return NVME_INVALID_FIELD | NVME_DNR; 7512 } 7513 return nvme_sec_prot_spdm_receive(n, req); 7514 default: 7515 return NVME_INVALID_FIELD | NVME_DNR; 7516 } 7517 } 7518 7519 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req) 7520 { 7521 return NVME_INVALID_FIELD | NVME_DNR; 7522 } 7523 7524 static uint16_t nvme_directive_receive(NvmeCtrl *n, 
NvmeRequest *req) 7525 { 7526 NvmeNamespace *ns; 7527 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 7528 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11); 7529 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 7530 uint8_t doper, dtype; 7531 uint32_t numd, trans_len; 7532 NvmeDirectiveIdentify id = { 7533 .supported = 1 << NVME_DIRECTIVE_IDENTIFY, 7534 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY, 7535 }; 7536 7537 numd = dw10 + 1; 7538 doper = dw11 & 0xff; 7539 dtype = (dw11 >> 8) & 0xff; 7540 7541 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2); 7542 7543 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY || 7544 doper != NVME_DIRECTIVE_RETURN_PARAMS) { 7545 return NVME_INVALID_FIELD | NVME_DNR; 7546 } 7547 7548 ns = nvme_ns(n, nsid); 7549 if (!ns) { 7550 return NVME_INVALID_FIELD | NVME_DNR; 7551 } 7552 7553 switch (dtype) { 7554 case NVME_DIRECTIVE_IDENTIFY: 7555 switch (doper) { 7556 case NVME_DIRECTIVE_RETURN_PARAMS: 7557 if (ns->endgrp && ns->endgrp->fdp.enabled) { 7558 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; 7559 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; 7560 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; 7561 } 7562 7563 return nvme_c2h(n, (uint8_t *)&id, trans_len, req); 7564 7565 default: 7566 return NVME_INVALID_FIELD | NVME_DNR; 7567 } 7568 7569 default: 7570 return NVME_INVALID_FIELD; 7571 } 7572 } 7573 7574 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) 7575 { 7576 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, 7577 nvme_adm_opc_str(req->cmd.opcode)); 7578 7579 if (!(n->cse.acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 7580 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); 7581 return NVME_INVALID_OPCODE | NVME_DNR; 7582 } 7583 7584 /* SGLs shall not be used for Admin commands in NVMe over PCIe */ 7585 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) { 7586 return NVME_INVALID_FIELD | NVME_DNR; 7587 } 7588 7589 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) { 7590 return NVME_INVALID_FIELD; 7591 } 7592 7593 switch (req->cmd.opcode) { 7594 case NVME_ADM_CMD_DELETE_SQ: 7595 return nvme_del_sq(n, req); 7596 case NVME_ADM_CMD_CREATE_SQ: 7597 return nvme_create_sq(n, req); 7598 case NVME_ADM_CMD_GET_LOG_PAGE: 7599 return nvme_get_log(n, req); 7600 case NVME_ADM_CMD_DELETE_CQ: 7601 return nvme_del_cq(n, req); 7602 case NVME_ADM_CMD_CREATE_CQ: 7603 return nvme_create_cq(n, req); 7604 case NVME_ADM_CMD_IDENTIFY: 7605 return nvme_identify(n, req); 7606 case NVME_ADM_CMD_ABORT: 7607 return nvme_abort(n, req); 7608 case NVME_ADM_CMD_SET_FEATURES: 7609 return nvme_set_feature(n, req); 7610 case NVME_ADM_CMD_GET_FEATURES: 7611 return nvme_get_feature(n, req); 7612 case NVME_ADM_CMD_ASYNC_EV_REQ: 7613 return nvme_aer(n, req); 7614 case NVME_ADM_CMD_NS_ATTACHMENT: 7615 return nvme_ns_attachment(n, req); 7616 case NVME_ADM_CMD_VIRT_MNGMT: 7617 return nvme_virt_mngmt(n, req); 7618 case NVME_ADM_CMD_DBBUF_CONFIG: 7619 return nvme_dbbuf_config(n, req); 7620 case NVME_ADM_CMD_FORMAT_NVM: 7621 return nvme_format(n, req); 7622 case NVME_ADM_CMD_DIRECTIVE_SEND: 7623 return nvme_directive_send(n, req); 7624 case NVME_ADM_CMD_DIRECTIVE_RECV: 7625 return nvme_directive_receive(n, req); 7626 case NVME_ADM_CMD_SECURITY_SEND: 7627 return nvme_security_send(n, req); 7628 case NVME_ADM_CMD_SECURITY_RECV: 7629 return nvme_security_receive(n, req); 7630 default: 7631 g_assert_not_reached(); 7632 } 7633 7634 return NVME_INVALID_OPCODE | NVME_DNR; 7635 } 7636 7637 static void nvme_update_sq_eventidx(const NvmeSQueue *sq) 7638 
{ 7639 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail); 7640 7641 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail, 7642 MEMTXATTRS_UNSPECIFIED); 7643 } 7644 7645 static void nvme_update_sq_tail(NvmeSQueue *sq) 7646 { 7647 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail, 7648 MEMTXATTRS_UNSPECIFIED); 7649 7650 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail); 7651 } 7652 7653 static int nvme_atomic_boundary_check(NvmeCtrl *n, NvmeCmd *cmd, 7654 NvmeAtomic *atomic) 7655 { 7656 NvmeRwCmd *rw = (NvmeRwCmd *)cmd; 7657 7658 if (atomic->atomic_boundary) { 7659 uint64_t slba = le64_to_cpu(rw->slba); 7660 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb); 7661 uint64_t elba = slba + nlb; 7662 uint64_t imask; 7663 7664 if ((slba < atomic->atomic_nabo) || (elba < atomic->atomic_nabo)) { 7665 return 0; 7666 } 7667 7668 /* Update slba/elba based on boundary offset */ 7669 slba = slba - atomic->atomic_nabo; 7670 elba = slba + nlb; 7671 7672 imask = ~(atomic->atomic_boundary - 1); 7673 if ((slba & imask) != (elba & imask)) { 7674 /* 7675 * The write crosses an atomic boundary and the controller provides 7676 * no atomicity guarantees unless AWUN/AWUPF are non-zero. 7677 */ 7678 if (n->atomic.atomic_max_write_size && 7679 ((nlb + 1) <= n->atomic.atomic_max_write_size)) { 7680 return 1; 7681 } 7682 return 0; 7683 } 7684 } 7685 return 1; 7686 } 7687 #define NVME_ATOMIC_NO_START 0 7688 #define NVME_ATOMIC_START_ATOMIC 1 7689 #define NVME_ATOMIC_START_NONATOMIC 2 7690 7691 static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd, 7692 NvmeAtomic *atomic) 7693 { 7694 NvmeRwCmd *rw = (NvmeRwCmd *)cmd; 7695 uint64_t slba = le64_to_cpu(rw->slba); 7696 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb); 7697 uint64_t elba = slba + nlb; 7698 bool cmd_atomic_wr = true; 7699 int i; 7700 7701 if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) && 7702 ((rw->nlb + 1) > atomic->atomic_max_write_size))) { 7703 cmd_atomic_wr = false; 7704 } 7705 7706 /* 7707 * Check if a write crosses an atomic boundary. 7708 */ 7709 if (cmd->opcode == NVME_CMD_WRITE) { 7710 if (!nvme_atomic_boundary_check(n, cmd, atomic)) { 7711 cmd_atomic_wr = false; 7712 } 7713 } 7714 7715 /* 7716 * Walk the queues to see if there are any atomic conflicts. 7717 */ 7718 for (i = 1; i < n->params.max_ioqpairs + 1; i++) { 7719 NvmeSQueue *sq; 7720 NvmeRequest *req; 7721 NvmeRwCmd *req_rw; 7722 uint64_t req_slba; 7723 uint32_t req_nlb; 7724 uint64_t req_elba; 7725 7726 sq = n->sq[i]; 7727 if (!sq) { 7728 continue; 7729 } 7730 7731 /* 7732 * Walk all the requests on a given queue. 
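         * Editorial note on the policy implemented here: an atomic write must
         * not be started while it overlaps any outstanding read or write on
         * the same namespace, and a non-atomic command must not be started
         * while it overlaps an outstanding atomic write; any other overlap is
         * allowed to proceed.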
7733 */ 7734 QTAILQ_FOREACH(req, &sq->out_req_list, entry) { 7735 req_rw = (NvmeRwCmd *)&req->cmd; 7736 7737 if (((req_rw->opcode == NVME_CMD_WRITE) || 7738 (req_rw->opcode == NVME_CMD_READ)) && 7739 (cmd->nsid == req->ns->params.nsid)) { 7740 req_slba = le64_to_cpu(req_rw->slba); 7741 req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb); 7742 req_elba = req_slba + req_nlb; 7743 7744 if (cmd_atomic_wr) { 7745 if ((elba >= req_slba) && (slba <= req_elba)) { 7746 return NVME_ATOMIC_NO_START; 7747 } 7748 } else { 7749 if (req->atomic_write && ((elba >= req_slba) && 7750 (slba <= req_elba))) { 7751 return NVME_ATOMIC_NO_START; 7752 } 7753 } 7754 } 7755 } 7756 } 7757 if (cmd_atomic_wr) { 7758 return NVME_ATOMIC_START_ATOMIC; 7759 } 7760 return NVME_ATOMIC_START_NONATOMIC; 7761 } 7762 7763 static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd) 7764 { 7765 NvmeNamespace *ns = nvme_ns(n, cmd->nsid); 7766 7767 if (ns && ns->atomic.atomic_writes) { 7768 return &ns->atomic; 7769 } 7770 7771 if (n->atomic.atomic_writes) { 7772 return &n->atomic; 7773 } 7774 return NULL; 7775 } 7776 7777 static void nvme_process_sq(void *opaque) 7778 { 7779 NvmeSQueue *sq = opaque; 7780 NvmeCtrl *n = sq->ctrl; 7781 NvmeCQueue *cq = n->cq[sq->cqid]; 7782 7783 uint16_t status; 7784 hwaddr addr; 7785 NvmeCmd cmd; 7786 NvmeRequest *req; 7787 7788 if (n->dbbuf_enabled) { 7789 nvme_update_sq_tail(sq); 7790 } 7791 7792 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) { 7793 NvmeAtomic *atomic; 7794 bool cmd_is_atomic; 7795 7796 addr = sq->dma_addr + (sq->head << NVME_SQES); 7797 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) { 7798 trace_pci_nvme_err_addr_read(addr); 7799 trace_pci_nvme_err_cfs(); 7800 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); 7801 break; 7802 } 7803 7804 atomic = nvme_get_atomic(n, &cmd); 7805 7806 cmd_is_atomic = false; 7807 if (sq->sqid && atomic) { 7808 int ret; 7809 7810 ret = nvme_atomic_write_check(n, &cmd, atomic); 7811 switch (ret) { 7812 case NVME_ATOMIC_NO_START: 7813 qemu_bh_schedule(sq->bh); 7814 return; 7815 case NVME_ATOMIC_START_ATOMIC: 7816 cmd_is_atomic = true; 7817 break; 7818 case NVME_ATOMIC_START_NONATOMIC: 7819 default: 7820 break; 7821 } 7822 } 7823 nvme_inc_sq_head(sq); 7824 7825 req = QTAILQ_FIRST(&sq->req_list); 7826 QTAILQ_REMOVE(&sq->req_list, req, entry); 7827 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); 7828 nvme_req_clear(req); 7829 req->cqe.cid = cmd.cid; 7830 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd)); 7831 7832 if (sq->sqid && atomic) { 7833 req->atomic_write = cmd_is_atomic; 7834 } 7835 7836 status = sq->sqid ? 
nvme_io_cmd(n, req) : 7837 nvme_admin_cmd(n, req); 7838 if (status != NVME_NO_COMPLETE) { 7839 req->status = status; 7840 nvme_enqueue_req_completion(cq, req); 7841 } 7842 7843 if (n->dbbuf_enabled) { 7844 nvme_update_sq_eventidx(sq); 7845 nvme_update_sq_tail(sq); 7846 } 7847 } 7848 } 7849 7850 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size) 7851 { 7852 uint8_t *config; 7853 7854 if (!msix_present(pci_dev)) { 7855 return; 7856 } 7857 7858 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr); 7859 7860 config = pci_dev->config + pci_dev->msix_cap; 7861 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE, 7862 table_size - 1); 7863 } 7864 7865 static void nvme_activate_virt_res(NvmeCtrl *n) 7866 { 7867 PCIDevice *pci_dev = PCI_DEVICE(n); 7868 NvmePriCtrlCap *cap = &n->pri_ctrl_cap; 7869 NvmeSecCtrlEntry *sctrl; 7870 7871 /* -1 to account for the admin queue */ 7872 if (pci_is_vf(pci_dev)) { 7873 sctrl = nvme_sctrl(n); 7874 cap->vqprt = sctrl->nvq; 7875 cap->viprt = sctrl->nvi; 7876 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0; 7877 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1; 7878 } else { 7879 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap; 7880 cap->virfap = n->next_pri_ctrl_cap.virfap; 7881 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) + 7882 le16_to_cpu(cap->vqrfap) - 1; 7883 n->conf_msix_qsize = le16_to_cpu(cap->viprt) + 7884 le16_to_cpu(cap->virfap); 7885 } 7886 } 7887 7888 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst) 7889 { 7890 PCIDevice *pci_dev = PCI_DEVICE(n); 7891 NvmeSecCtrlEntry *sctrl; 7892 NvmeNamespace *ns; 7893 int i; 7894 7895 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 7896 ns = nvme_ns(n, i); 7897 if (!ns) { 7898 continue; 7899 } 7900 7901 nvme_ns_drain(ns); 7902 } 7903 7904 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 7905 if (n->sq[i] != NULL) { 7906 nvme_free_sq(n->sq[i], n); 7907 } 7908 } 7909 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 7910 if (n->cq[i] != NULL) { 7911 nvme_free_cq(n->cq[i], n); 7912 } 7913 } 7914 7915 while (!QTAILQ_EMPTY(&n->aer_queue)) { 7916 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue); 7917 QTAILQ_REMOVE(&n->aer_queue, event, entry); 7918 g_free(event); 7919 } 7920 7921 if (n->params.sriov_max_vfs) { 7922 if (!pci_is_vf(pci_dev)) { 7923 for (i = 0; i < n->nr_sec_ctrls; i++) { 7924 sctrl = &n->sec_ctrl_list[i]; 7925 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); 7926 } 7927 } 7928 7929 if (rst != NVME_RESET_CONTROLLER) { 7930 nvme_activate_virt_res(n); 7931 } 7932 } 7933 7934 n->aer_queued = 0; 7935 n->aer_mask = 0; 7936 n->outstanding_aers = 0; 7937 n->qs_created = false; 7938 7939 n->dn = n->params.atomic_dn; /* Set Disable Normal */ 7940 7941 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize); 7942 7943 if (pci_is_vf(pci_dev)) { 7944 sctrl = nvme_sctrl(n); 7945 7946 stl_le_p(&n->bar.csts, sctrl->scs ? 
0 : NVME_CSTS_FAILED); 7947 } else { 7948 stl_le_p(&n->bar.csts, 0); 7949 } 7950 7951 stl_le_p(&n->bar.intms, 0); 7952 stl_le_p(&n->bar.intmc, 0); 7953 stl_le_p(&n->bar.cc, 0); 7954 7955 n->dbbuf_dbs = 0; 7956 n->dbbuf_eis = 0; 7957 n->dbbuf_enabled = false; 7958 } 7959 7960 static void nvme_ctrl_shutdown(NvmeCtrl *n) 7961 { 7962 NvmeNamespace *ns; 7963 int i; 7964 7965 if (n->pmr.dev) { 7966 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 7967 } 7968 7969 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 7970 ns = nvme_ns(n, i); 7971 if (!ns) { 7972 continue; 7973 } 7974 7975 nvme_ns_shutdown(ns); 7976 } 7977 } 7978 7979 static int nvme_start_ctrl(NvmeCtrl *n) 7980 { 7981 uint64_t cap = ldq_le_p(&n->bar.cap); 7982 uint32_t cc = ldl_le_p(&n->bar.cc); 7983 uint32_t aqa = ldl_le_p(&n->bar.aqa); 7984 uint64_t asq = ldq_le_p(&n->bar.asq); 7985 uint64_t acq = ldq_le_p(&n->bar.acq); 7986 uint32_t page_bits = NVME_CC_MPS(cc) + 12; 7987 uint32_t page_size = 1 << page_bits; 7988 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n); 7989 7990 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) { 7991 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi), 7992 le16_to_cpu(sctrl->nvq)); 7993 return -1; 7994 } 7995 if (unlikely(n->cq[0])) { 7996 trace_pci_nvme_err_startfail_cq(); 7997 return -1; 7998 } 7999 if (unlikely(n->sq[0])) { 8000 trace_pci_nvme_err_startfail_sq(); 8001 return -1; 8002 } 8003 if (unlikely(asq & (page_size - 1))) { 8004 trace_pci_nvme_err_startfail_asq_misaligned(asq); 8005 return -1; 8006 } 8007 if (unlikely(acq & (page_size - 1))) { 8008 trace_pci_nvme_err_startfail_acq_misaligned(acq); 8009 return -1; 8010 } 8011 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) { 8012 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc)); 8013 return -1; 8014 } 8015 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) { 8016 trace_pci_nvme_err_startfail_page_too_small( 8017 NVME_CC_MPS(cc), 8018 NVME_CAP_MPSMIN(cap)); 8019 return -1; 8020 } 8021 if (unlikely(NVME_CC_MPS(cc) > 8022 NVME_CAP_MPSMAX(cap))) { 8023 trace_pci_nvme_err_startfail_page_too_large( 8024 NVME_CC_MPS(cc), 8025 NVME_CAP_MPSMAX(cap)); 8026 return -1; 8027 } 8028 if (unlikely(!NVME_AQA_ASQS(aqa))) { 8029 trace_pci_nvme_err_startfail_asqent_sz_zero(); 8030 return -1; 8031 } 8032 if (unlikely(!NVME_AQA_ACQS(aqa))) { 8033 trace_pci_nvme_err_startfail_acqent_sz_zero(); 8034 return -1; 8035 } 8036 8037 n->page_bits = page_bits; 8038 n->page_size = page_size; 8039 n->max_prp_ents = n->page_size / sizeof(uint64_t); 8040 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1); 8041 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1); 8042 8043 nvme_set_timestamp(n, 0ULL); 8044 8045 /* verify that the command sets of attached namespaces are supported */ 8046 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { 8047 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i); 8048 8049 if (!ns || (!ns->params.shared && ns->ctrl != n)) { 8050 continue; 8051 } 8052 8053 if (nvme_csi_supported(n, ns->csi) && !ns->params.detached) { 8054 if (!ns->attached || ns->params.shared) { 8055 nvme_attach_ns(n, ns); 8056 } 8057 } 8058 } 8059 8060 nvme_update_dsm_limits(n, NULL); 8061 8062 return 0; 8063 } 8064 8065 static void nvme_cmb_enable_regs(NvmeCtrl *n) 8066 { 8067 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc); 8068 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz); 8069 8070 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1); 8071 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1); 8072 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR); 8073 stl_le_p(&n->bar.cmbloc, cmbloc); 
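    /*
     * Editorial note: the CMBSZ fields set below advertise submission queues,
     * PRP/SGL lists, read data and write data in the CMB (CQS stays clear),
     * and SZU = 2 selects 1 MiB granularity so SZ carries cmb_size_mb
     * directly.
     */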
8074 8075 NVME_CMBSZ_SET_SQS(cmbsz, 1); 8076 NVME_CMBSZ_SET_CQS(cmbsz, 0); 8077 NVME_CMBSZ_SET_LISTS(cmbsz, 1); 8078 NVME_CMBSZ_SET_RDS(cmbsz, 1); 8079 NVME_CMBSZ_SET_WDS(cmbsz, 1); 8080 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */ 8081 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb); 8082 stl_le_p(&n->bar.cmbsz, cmbsz); 8083 } 8084 8085 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, 8086 unsigned size) 8087 { 8088 PCIDevice *pci = PCI_DEVICE(n); 8089 uint64_t cap = ldq_le_p(&n->bar.cap); 8090 uint32_t cc = ldl_le_p(&n->bar.cc); 8091 uint32_t intms = ldl_le_p(&n->bar.intms); 8092 uint32_t csts = ldl_le_p(&n->bar.csts); 8093 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts); 8094 8095 if (unlikely(offset & (sizeof(uint32_t) - 1))) { 8096 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32, 8097 "MMIO write not 32-bit aligned," 8098 " offset=0x%"PRIx64"", offset); 8099 /* should be ignored, fall through for now */ 8100 } 8101 8102 if (unlikely(size < sizeof(uint32_t))) { 8103 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall, 8104 "MMIO write smaller than 32-bits," 8105 " offset=0x%"PRIx64", size=%u", 8106 offset, size); 8107 /* should be ignored, fall through for now */ 8108 } 8109 8110 switch (offset) { 8111 case NVME_REG_INTMS: 8112 if (unlikely(msix_enabled(pci))) { 8113 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 8114 "undefined access to interrupt mask set" 8115 " when MSI-X is enabled"); 8116 /* should be ignored, fall through for now */ 8117 } 8118 intms |= data; 8119 stl_le_p(&n->bar.intms, intms); 8120 n->bar.intmc = n->bar.intms; 8121 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms); 8122 nvme_irq_check(n); 8123 break; 8124 case NVME_REG_INTMC: 8125 if (unlikely(msix_enabled(pci))) { 8126 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 8127 "undefined access to interrupt mask clr" 8128 " when MSI-X is enabled"); 8129 /* should be ignored, fall through for now */ 8130 } 8131 intms &= ~data; 8132 stl_le_p(&n->bar.intms, intms); 8133 n->bar.intmc = n->bar.intms; 8134 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms); 8135 nvme_irq_check(n); 8136 break; 8137 case NVME_REG_CC: 8138 stl_le_p(&n->bar.cc, data); 8139 8140 trace_pci_nvme_mmio_cfg(data & 0xffffffff); 8141 8142 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) { 8143 trace_pci_nvme_mmio_shutdown_set(); 8144 nvme_ctrl_shutdown(n); 8145 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT); 8146 csts |= NVME_CSTS_SHST_COMPLETE; 8147 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) { 8148 trace_pci_nvme_mmio_shutdown_cleared(); 8149 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT); 8150 } 8151 8152 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) { 8153 if (unlikely(nvme_start_ctrl(n))) { 8154 trace_pci_nvme_err_startfail(); 8155 csts = NVME_CSTS_FAILED; 8156 } else { 8157 trace_pci_nvme_mmio_start_success(); 8158 csts = NVME_CSTS_READY; 8159 } 8160 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) { 8161 trace_pci_nvme_mmio_stopped(); 8162 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER); 8163 8164 break; 8165 } 8166 8167 stl_le_p(&n->bar.csts, csts); 8168 8169 break; 8170 case NVME_REG_CSTS: 8171 if (data & (1 << 4)) { 8172 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported, 8173 "attempted to W1C CSTS.NSSRO" 8174 " but CAP.NSSRS is zero (not supported)"); 8175 } else if (data != 0) { 8176 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts, 8177 "attempted to set a read only bit" 8178 " of controller status"); 8179 } 8180 break; 8181 case NVME_REG_NSSR: 8182 if (data == 0x4e564d65) { 8183 trace_pci_nvme_ub_mmiowr_ssreset_unsupported(); 
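            /*
             * Editorial note: 4E564D65h is ASCII "NVMe", the magic value that
             * would normally request an NVM Subsystem Reset; NSSR is not
             * implemented here (CAP.NSSRS is zero), so the write is only
             * traced and otherwise ignored.
             */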
8184 } else { 8185 /* The spec says that writes of other values have no effect */ 8186 return; 8187 } 8188 break; 8189 case NVME_REG_AQA: 8190 stl_le_p(&n->bar.aqa, data); 8191 trace_pci_nvme_mmio_aqattr(data & 0xffffffff); 8192 break; 8193 case NVME_REG_ASQ: 8194 stn_le_p(&n->bar.asq, size, data); 8195 trace_pci_nvme_mmio_asqaddr(data); 8196 break; 8197 case NVME_REG_ASQ + 4: 8198 stl_le_p((uint8_t *)&n->bar.asq + 4, data); 8199 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq)); 8200 break; 8201 case NVME_REG_ACQ: 8202 trace_pci_nvme_mmio_acqaddr(data); 8203 stn_le_p(&n->bar.acq, size, data); 8204 break; 8205 case NVME_REG_ACQ + 4: 8206 stl_le_p((uint8_t *)&n->bar.acq + 4, data); 8207 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq)); 8208 break; 8209 case NVME_REG_CMBLOC: 8210 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved, 8211 "invalid write to reserved CMBLOC" 8212 " when CMBSZ is zero, ignored"); 8213 return; 8214 case NVME_REG_CMBSZ: 8215 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly, 8216 "invalid write to read only CMBSZ, ignored"); 8217 return; 8218 case NVME_REG_CMBMSC: 8219 if (!NVME_CAP_CMBS(cap)) { 8220 return; 8221 } 8222 8223 stn_le_p(&n->bar.cmbmsc, size, data); 8224 n->cmb.cmse = false; 8225 8226 if (NVME_CMBMSC_CRE(data)) { 8227 nvme_cmb_enable_regs(n); 8228 8229 if (NVME_CMBMSC_CMSE(data)) { 8230 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc); 8231 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT; 8232 if (cba + int128_get64(n->cmb.mem.size) < cba) { 8233 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts); 8234 NVME_CMBSTS_SET_CBAI(cmbsts, 1); 8235 stl_le_p(&n->bar.cmbsts, cmbsts); 8236 return; 8237 } 8238 8239 n->cmb.cba = cba; 8240 n->cmb.cmse = true; 8241 } 8242 } else { 8243 n->bar.cmbsz = 0; 8244 n->bar.cmbloc = 0; 8245 } 8246 8247 return; 8248 case NVME_REG_CMBMSC + 4: 8249 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data); 8250 return; 8251 8252 case NVME_REG_PMRCAP: 8253 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, 8254 "invalid write to PMRCAP register, ignored"); 8255 return; 8256 case NVME_REG_PMRCTL: 8257 if (!NVME_CAP_PMRS(cap)) { 8258 return; 8259 } 8260 8261 stl_le_p(&n->bar.pmrctl, data); 8262 if (NVME_PMRCTL_EN(data)) { 8263 memory_region_set_enabled(&n->pmr.dev->mr, true); 8264 pmrsts = 0; 8265 } else { 8266 memory_region_set_enabled(&n->pmr.dev->mr, false); 8267 NVME_PMRSTS_SET_NRDY(pmrsts, 1); 8268 n->pmr.cmse = false; 8269 } 8270 stl_le_p(&n->bar.pmrsts, pmrsts); 8271 return; 8272 case NVME_REG_PMRSTS: 8273 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, 8274 "invalid write to PMRSTS register, ignored"); 8275 return; 8276 case NVME_REG_PMREBS: 8277 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly, 8278 "invalid write to PMREBS register, ignored"); 8279 return; 8280 case NVME_REG_PMRSWTP: 8281 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly, 8282 "invalid write to PMRSWTP register, ignored"); 8283 return; 8284 case NVME_REG_PMRMSCL: 8285 if (!NVME_CAP_PMRS(cap)) { 8286 return; 8287 } 8288 8289 stl_le_p(&n->bar.pmrmscl, data); 8290 n->pmr.cmse = false; 8291 8292 if (NVME_PMRMSCL_CMSE(data)) { 8293 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu); 8294 hwaddr cba = pmrmscu << 32 | 8295 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT); 8296 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) { 8297 NVME_PMRSTS_SET_CBAI(pmrsts, 1); 8298 stl_le_p(&n->bar.pmrsts, pmrsts); 8299 return; 8300 } 8301 8302 n->pmr.cmse = true; 8303 n->pmr.cba = cba; 8304 } 8305 8306 return; 8307 case NVME_REG_PMRMSCU: 8308 if (!NVME_CAP_PMRS(cap)) { 8309 
return; 8310 } 8311 8312 stl_le_p(&n->bar.pmrmscu, data); 8313 return; 8314 default: 8315 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, 8316 "invalid MMIO write," 8317 " offset=0x%"PRIx64", data=%"PRIx64"", 8318 offset, data); 8319 break; 8320 } 8321 } 8322 8323 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) 8324 { 8325 NvmeCtrl *n = (NvmeCtrl *)opaque; 8326 uint8_t *ptr = (uint8_t *)&n->bar; 8327 8328 trace_pci_nvme_mmio_read(addr, size); 8329 8330 if (unlikely(addr & (sizeof(uint32_t) - 1))) { 8331 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, 8332 "MMIO read not 32-bit aligned," 8333 " offset=0x%"PRIx64"", addr); 8334 /* should RAZ, fall through for now */ 8335 } else if (unlikely(size < sizeof(uint32_t))) { 8336 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall, 8337 "MMIO read smaller than 32-bits," 8338 " offset=0x%"PRIx64"", addr); 8339 /* should RAZ, fall through for now */ 8340 } 8341 8342 if (addr > sizeof(n->bar) - size) { 8343 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs, 8344 "MMIO read beyond last register," 8345 " offset=0x%"PRIx64", returning 0", addr); 8346 8347 return 0; 8348 } 8349 8350 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs && 8351 addr != NVME_REG_CSTS) { 8352 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size); 8353 return 0; 8354 } 8355 8356 /* 8357 * When PMRWBM bit 1 is set then read from 8358 * from PMRSTS should ensure prior writes 8359 * made it to persistent media 8360 */ 8361 if (addr == NVME_REG_PMRSTS && 8362 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) { 8363 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 8364 } 8365 8366 return ldn_le_p(ptr + addr, size); 8367 } 8368 8369 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) 8370 { 8371 PCIDevice *pci = PCI_DEVICE(n); 8372 uint32_t qid; 8373 8374 if (unlikely(addr & ((1 << 2) - 1))) { 8375 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, 8376 "doorbell write not 32-bit aligned," 8377 " offset=0x%"PRIx64", ignoring", addr); 8378 return; 8379 } 8380 8381 if (((addr - 0x1000) >> 2) & 1) { 8382 /* Completion queue doorbell write */ 8383 8384 uint16_t new_head = val & 0xffff; 8385 NvmeCQueue *cq; 8386 8387 qid = (addr - (0x1000 + (1 << 2))) >> 3; 8388 if (unlikely(nvme_check_cqid(n, qid))) { 8389 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq, 8390 "completion queue doorbell write" 8391 " for nonexistent queue," 8392 " sqid=%"PRIu32", ignoring", qid); 8393 8394 /* 8395 * NVM Express v1.3d, Section 4.1 state: "If host software writes 8396 * an invalid value to the Submission Queue Tail Doorbell or 8397 * Completion Queue Head Doorbell register and an Asynchronous Event 8398 * Request command is outstanding, then an asynchronous event is 8399 * posted to the Admin Completion Queue with a status code of 8400 * Invalid Doorbell Write Value." 8401 * 8402 * Also note that the spec includes the "Invalid Doorbell Register" 8403 * status code, but nowhere does it specify when to use it. 8404 * However, it seems reasonable to use it here in a similar 8405 * fashion. 
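             * In the handlers below, a doorbell write for a nonexistent queue
             * raises Invalid Doorbell Register, while an out-of-range head or
             * tail value raises Invalid Doorbell Write Value, in both the CQ
             * and SQ paths and only while at least one AER is outstanding.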
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    PCIDevice *pci = PCI_DEVICE(n);
    uint32_t qid;

    if (unlikely(addr & ((1 << 2) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
                       "doorbell write not 32-bit aligned,"
                       " offset=0x%"PRIx64", ignoring", addr);
        return;
    }

    if (((addr - 0x1000) >> 2) & 1) {
        /* Completion queue doorbell write */

        uint16_t new_head = val & 0xffff;
        NvmeCQueue *cq;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (unlikely(nvme_check_cqid(n, qid))) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
                           "completion queue doorbell write"
                           " for nonexistent queue,"
                           " sqid=%"PRIu32", ignoring", qid);

            /*
             * NVM Express v1.3d, Section 4.1 states: "If host software writes
             * an invalid value to the Submission Queue Tail Doorbell or
             * Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, then an asynchronous
             * event is posted to the Admin Completion Queue with a status
             * code of Invalid Doorbell Write Value."
             *
             * Also note that the spec includes the "Invalid Doorbell Register"
             * status code, but nowhere does it specify when to use it.
             * However, it seems reasonable to use it here in a similar
             * fashion.
             */
            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        cq = n->cq[qid];
        if (unlikely(new_head >= cq->size)) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
                           "completion queue doorbell write value"
                           " beyond queue size, sqid=%"PRIu32","
                           " new_head=%"PRIu16", ignoring",
                           qid, new_head);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);

        /* schedule deferred cqe posting if queue was previously full */
        if (nvme_cq_full(cq)) {
            qemu_bh_schedule(cq->bh);
        }

        cq->head = new_head;
        if (!qid && n->dbbuf_enabled) {
            stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
        }

        if (cq->tail == cq->head) {
            if (cq->irq_enabled) {
                n->cq_pending--;
            }

            nvme_irq_deassert(n, cq);
        }
    } else {
        /* Submission queue doorbell write */

        uint16_t new_tail = val & 0xffff;
        NvmeSQueue *sq;

        qid = (addr - 0x1000) >> 3;
        if (unlikely(nvme_check_sqid(n, qid))) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
                           "submission queue doorbell write"
                           " for nonexistent queue,"
                           " sqid=%"PRIu32", ignoring", qid);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        sq = n->sq[qid];
        if (unlikely(new_tail >= sq->size)) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
                           "submission queue doorbell write value"
                           " beyond queue size, sqid=%"PRIu32","
                           " new_tail=%"PRIu16", ignoring",
                           qid, new_tail);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);

        sq->tail = new_tail;
        if (!qid && n->dbbuf_enabled) {
            /*
             * The spec states "the host shall also update the controller's
             * corresponding doorbell property to match the value of that
             * entry in the Shadow Doorbell buffer."
             *
             * Since this context is currently a VM trap, we can safely enforce
             * the requirement from the device side in case the host is
             * misbehaving.
             *
             * Note, we shouldn't have to do this, but various drivers,
             * including ones that run on Linux, are not updating Admin Queues,
             * so we can't trust reading it for an appropriate sq tail.
             */
            stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
        }

        qemu_bh_schedule(sq->bh);
    }
}

static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
                            unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;

    trace_pci_nvme_mmio_write(addr, data, size);

    if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
        addr != NVME_REG_CSTS) {
        trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
        return;
    }

    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};

static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
                           unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    stn_le_p(&n->cmb.buf[addr], size, data);
}

static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    return ldn_le_p(&n->cmb.buf[addr], size);
}

static const MemoryRegionOps nvme_cmb_ops = {
    .read = nvme_cmb_read,
    .write = nvme_cmb_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
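
/*
 * Validates the device properties at realize time. On an invalid or
 * inconsistent configuration this sets errp and returns false; as a side
 * effect it also translates the deprecated num_queues property and marks
 * the PMR memory backend as mapped.
 */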
static bool nvme_check_params(NvmeCtrl *n, Error **errp)
{
    NvmeParams *params = &n->params;

    if (params->num_queues) {
        warn_report("num_queues is deprecated; please use max_ioqpairs "
                    "instead");

        params->max_ioqpairs = params->num_queues - 1;
    }

    if (n->namespace.blkconf.blk && n->subsys) {
        error_setg(errp, "subsystem support is unavailable with legacy "
                   "namespace ('drive' property)");
        return false;
    }

    if (params->max_ioqpairs < 1 ||
        params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
        error_setg(errp, "max_ioqpairs must be between 1 and %d",
                   NVME_MAX_IOQPAIRS);
        return false;
    }

    if (params->msix_qsize < 1 ||
        params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
        error_setg(errp, "msix_qsize must be between 1 and %d",
                   PCI_MSIX_FLAGS_QSIZE + 1);
        return false;
    }

    if (!params->serial) {
        error_setg(errp, "serial property not set");
        return false;
    }

    if (params->mqes < 1) {
        error_setg(errp, "mqes property cannot be less than 1");
        return false;
    }

    if (n->pmr.dev) {
        if (params->msix_exclusive_bar) {
            error_setg(errp, "not enough BARs available to enable PMR");
            return false;
        }

        if (host_memory_backend_is_mapped(n->pmr.dev)) {
            error_setg(errp, "can't use already busy memdev: %s",
                       object_get_canonical_path_component(OBJECT(n->pmr.dev)));
            return false;
        }

        if (!is_power_of_2(n->pmr.dev->size)) {
            error_setg(errp, "pmr backend size needs to be power of 2 in size");
            return false;
        }

        host_memory_backend_set_mapped(n->pmr.dev, true);
    }

    if (!n->params.mdts || ((1 << n->params.mdts) + 1) > IOV_MAX) {
        error_setg(errp, "mdts exceeds IOV_MAX");
        return false;
    }

    if (n->params.zasl > n->params.mdts) {
        error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
                   "than or equal to mdts (Maximum Data Transfer Size)");
        return false;
    }

    if (!n->params.vsl) {
        error_setg(errp, "vsl must be non-zero");
        return false;
    }

    if (params->sriov_max_vfs) {
        if (!n->subsys) {
            error_setg(errp, "subsystem is required for the use of SR-IOV");
            return false;
        }

        if (params->cmb_size_mb) {
            error_setg(errp, "CMB is not supported with SR-IOV");
            return false;
        }

        if (n->pmr.dev) {
            error_setg(errp, "PMR is not supported with SR-IOV");
            return false;
        }

        if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
            error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
                       " must be set for the use of SR-IOV");
            return false;
        }

        if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
            error_setg(errp, "sriov_vq_flexible must be greater than or equal"
                       " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
            return false;
        }

        if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
            error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
                       " greater than or equal to 2");
            return false;
        }

        if (params->sriov_vi_flexible < params->sriov_max_vfs) {
            error_setg(errp, "sriov_vi_flexible must be greater than or equal"
                       " to %d (sriov_max_vfs)", params->sriov_max_vfs);
            return false;
        }

        if (params->msix_qsize < params->sriov_vi_flexible + 1) {
            error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
                       " greater than or equal to 1");
            return false;
        }

        if (params->sriov_max_vi_per_vf &&
            (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
            error_setg(errp, "sriov_max_vi_per_vf must meet:"
                       " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
                       " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
            return false;
        }

        if (params->sriov_max_vq_per_vf &&
            (params->sriov_max_vq_per_vf < 2 ||
             (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
            error_setg(errp, "sriov_max_vq_per_vf must meet:"
                       " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
                       " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
            return false;
        }
    }

    return true;
}
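
/*
 * Initializes the runtime controller state: the queue pointer arrays, AER
 * bookkeeping, the secondary controller list and the primary controller
 * capabilities used for SR-IOV resource assignment, and the atomic write
 * parameters advertised in Identify Controller.
 */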
static void nvme_init_state(NvmeCtrl *n)
{
    NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
    NvmeSecCtrlEntry *list = n->sec_ctrl_list;
    NvmeSecCtrlEntry *sctrl;
    PCIDevice *pci = PCI_DEVICE(n);
    NvmeIdCtrl *id = &n->id_ctrl;
    uint8_t max_vfs;
    int i;

    if (pci_is_vf(pci)) {
        sctrl = nvme_sctrl(n);
        max_vfs = 0;
        n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
        n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
    } else {
        max_vfs = n->params.sriov_max_vfs;
        n->conf_ioqpairs = n->params.max_ioqpairs;
        n->conf_msix_qsize = n->params.msix_qsize;
    }

    n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
    n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
    n->temperature = NVME_TEMPERATURE;
    n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
    n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
    QTAILQ_INIT(&n->aer_queue);

    n->nr_sec_ctrls = max_vfs;
    for (i = 0; i < max_vfs; i++) {
        sctrl = &list[i];
        sctrl->pcid = cpu_to_le16(n->cntlid);
        sctrl->vfn = cpu_to_le16(i + 1);
    }

    n->spdm_socket = -1;

    cap->cntlid = cpu_to_le16(n->cntlid);
    cap->crt = NVME_CRT_VQ | NVME_CRT_VI;

    if (pci_is_vf(pci)) {
        cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
    } else {
        cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
                                 n->params.sriov_vq_flexible);
        cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
        cap->vqrfap = cap->vqfrt;
        cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
        cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
                      cpu_to_le16(n->params.sriov_max_vq_per_vf) :
                      cap->vqfrt / MAX(max_vfs, 1);
    }

    if (pci_is_vf(pci)) {
        cap->viprt = cpu_to_le16(n->conf_msix_qsize);
    } else {
        cap->viprt = cpu_to_le16(n->params.msix_qsize -
                                 n->params.sriov_vi_flexible);
        cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
        cap->virfap = cap->vifrt;
        cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
        cap->vifrsm = n->params.sriov_max_vi_per_vf ?
                      cpu_to_le16(n->params.sriov_max_vi_per_vf) :
                      cap->vifrt / MAX(max_vfs, 1);
    }

    /* Atomic Write */
    id->awun = cpu_to_le16(n->params.atomic_awun);
    id->awupf = cpu_to_le16(n->params.atomic_awupf);
    n->dn = n->params.atomic_dn;

    if (id->awun || id->awupf) {
        if (id->awupf > id->awun) {
            id->awupf = 0;
        }

        nvme_atomic_configure_max_write_size(n->dn, n->params.atomic_awun,
                                             n->params.atomic_awupf,
                                             &n->atomic);
    }
}

static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
{
    uint64_t cmb_size = n->params.cmb_size_mb * MiB;
    uint64_t cap = ldq_le_p(&n->bar.cap);

    n->cmb.buf = g_malloc0(cmb_size);
    memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
                          "nvme-cmb", cmb_size);
    pci_register_bar(pci_dev, NVME_CMB_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);

    NVME_CAP_SET_CMBS(cap, 1);
    stq_le_p(&n->bar.cap, cap);

    if (n->params.legacy_cmb) {
        nvme_cmb_enable_regs(n);
        n->cmb.cmse = true;
    }
}

static bool nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
    uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);

    if (memory_region_size(&n->pmr.dev->mr) < 16) {
        error_setg(errp, "PMR device must have at least 16 bytes");
        return false;
    }

    NVME_PMRCAP_SET_RDS(pmrcap, 1);
    NVME_PMRCAP_SET_WDS(pmrcap, 1);
    NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
    /* Turn on bit 1 support */
    NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
    NVME_PMRCAP_SET_CMSS(pmrcap, 1);
    stl_le_p(&n->bar.pmrcap, pmrcap);

    pci_register_bar(pci_dev, NVME_PMR_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);

    memory_region_set_enabled(&n->pmr.dev->mr, false);

    return true;
}
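
/*
 * Computes the size of the main memory BAR: the fixed register file plus one
 * submission and one completion doorbell per queue, followed (when interrupts
 * are configured) by a page-aligned MSI-X table and PBA. The result is
 * rounded up to the next power of two.
 */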
static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
                               unsigned *msix_table_offset,
                               unsigned *msix_pba_offset)
{
    uint64_t bar_size, msix_table_size;

    bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;

    if (total_irqs == 0) {
        goto out;
    }

    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);

    if (msix_table_offset) {
        *msix_table_offset = bar_size;
    }

    msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
    bar_size += msix_table_size;
    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);

    if (msix_pba_offset) {
        *msix_pba_offset = bar_size;
    }

    bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;

out:
    return pow2ceil(bar_size);
}

static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
                            Error **errp)
{
    uint16_t vf_dev_id = n->params.use_intel_id ?
                         PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
    NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
    uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
                                       le16_to_cpu(cap->vifrsm),
                                       NULL, NULL);

    if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
                            n->params.sriov_max_vfs, n->params.sriov_max_vfs,
                            NVME_VF_OFFSET, NVME_VF_STRIDE, errp)) {
        return false;
    }

    pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                              PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);

    return true;
}

static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
{
    Error *err = NULL;
    int ret;

    ret = pci_pm_init(pci_dev, offset, &err);
    if (err) {
        error_report_err(err);
        return ret;
    }

    pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
                 PCI_PM_CAP_VER_1_2);
    pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
                 PCI_PM_CTRL_NO_SOFT_RESET);
    pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
                 PCI_PM_CTRL_STATE_MASK);

    return 0;
}

static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
{
    void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
    uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
    void *rsp = doe_cap->read_mbox;
    uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;

    uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
                                     SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
                                     req, req_len, rsp, rsp_len);
    doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);

    return recvd != 0;
}

static DOEProtocol doe_spdm_prot[] = {
    { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
    { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
    { }
};
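
/*
 * Sets up the PCI presentation of the controller: configuration space
 * identifiers, power management, PCIe and (optionally) ARI/SR-IOV
 * capabilities, the BAR0 register/doorbell/MSI-X layout, the CMB and PMR
 * BARs, and, when an SPDM port is configured, the DOE or NVMe transport
 * connection.
 */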
static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
    ERRP_GUARD();
    uint8_t *pci_conf = pci_dev->config;
    uint64_t bar_size;
    unsigned msix_table_offset = 0, msix_pba_offset = 0;
    unsigned nr_vectors;
    int ret;

    pci_conf[PCI_INTERRUPT_PIN] = pci_is_vf(pci_dev) ? 0 : 1;
    pci_config_set_prog_interface(pci_conf, 0x2);

    if (n->params.use_intel_id) {
        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
    } else {
        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
    }

    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
    nvme_add_pm_capability(pci_dev, 0x60);
    pcie_endpoint_cap_init(pci_dev, 0x80);
    pcie_cap_flr_init(pci_dev);
    if (n->params.sriov_max_vfs) {
        pcie_ari_init(pci_dev, 0x100);
    }

    if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
        bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
        memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
                              bar_size);
        pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                         PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
        ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
    } else {
        assert(n->params.msix_qsize >= 1);

        /* add one to max_ioqpairs to account for the admin queue pair */
        if (!pci_is_vf(pci_dev)) {
            nr_vectors = n->params.msix_qsize;
            bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
                                      nr_vectors, &msix_table_offset,
                                      &msix_pba_offset);
        } else {
            NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
            NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;

            nr_vectors = le16_to_cpu(cap->vifrsm);
            bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
                                      &msix_table_offset, &msix_pba_offset);
        }

        memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
        memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
                              msix_table_offset);
        memory_region_add_subregion(&n->bar0, 0, &n->iomem);

        pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                         PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);

        ret = msix_init(pci_dev, nr_vectors,
                        &n->bar0, 0, msix_table_offset,
                        &n->bar0, 0, msix_pba_offset, 0, errp);
    }

    if (ret == -ENOTSUP) {
        /* report that msix is not supported, but do not error out */
        warn_report_err(*errp);
        *errp = NULL;
    } else if (ret < 0) {
        /* propagate error to caller */
        return false;
    }

    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
        !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
        return false;
    }

    nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);

    pcie_cap_deverr_init(pci_dev);

    /* SPDM Initialisation */
    if (pci_dev->spdm_port) {
        uint16_t doe_offset = PCI_CONFIG_SPACE_SIZE;

        switch (pci_dev->spdm_trans) {
        case SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE:
            if (n->params.sriov_max_vfs) {
                doe_offset += PCI_ARI_SIZEOF;
            }

            pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
                          doe_spdm_prot, true, 0);

            pci_dev->doe_spdm.spdm_socket =
                spdm_socket_connect(pci_dev->spdm_port, errp);

            if (pci_dev->doe_spdm.spdm_socket < 0) {
                return false;
            }
            break;
        case SPDM_SOCKET_TRANSPORT_TYPE_NVME:
            n->spdm_socket = spdm_socket_connect(pci_dev->spdm_port, errp);
            if (n->spdm_socket < 0) {
                return false;
            }
            break;
        default:
            return false;
        }
    }

    if (n->params.cmb_size_mb) {
        nvme_init_cmb(n, pci_dev);
    }

    if (n->pmr.dev) {
        if (!nvme_init_pmr(n, pci_dev, errp)) {
            return false;
        }
    }

    return true;
}

static void nvme_init_subnqn(NvmeCtrl *n)
{
    NvmeSubsystem *subsys = n->subsys;
    NvmeIdCtrl *id = &n->id_ctrl;

    if (!subsys) {
        snprintf((char *)id->subnqn, sizeof(id->subnqn),
                 "nqn.2019-08.org.qemu:%s", n->params.serial);
    } else {
        pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
    }
}
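
/*
 * Populates the Identify Controller data structure and the read-only CAP/VS
 * registers from the configured properties, and selects the admin and I/O
 * command sets supported by this controller.
 */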
static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
{
    NvmeIdCtrl *id = &n->id_ctrl;
    uint8_t *pci_conf = pci_dev->config;
    uint64_t cap = ldq_le_p(&n->bar.cap);
    NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
    uint32_t ctratt = le32_to_cpu(id->ctratt);
    uint16_t oacs;

    memcpy(n->cse.acs, nvme_cse_acs_default, sizeof(n->cse.acs));
    memcpy(n->cse.iocs.nvm, nvme_cse_iocs_nvm_default, sizeof(n->cse.iocs.nvm));
    memcpy(n->cse.iocs.zoned, nvme_cse_iocs_zoned_default,
           sizeof(n->cse.iocs.zoned));

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');

    id->cntlid = cpu_to_le16(n->cntlid);

    id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);

    ctratt |= NVME_CTRATT_ELBAS;
    if (n->params.ctratt.mem) {
        ctratt |= NVME_CTRATT_MEM;
    }
    id->ctratt = cpu_to_le32(ctratt);

    id->rab = 6;

    if (n->params.use_intel_id) {
        id->ieee[0] = 0xb3;
        id->ieee[1] = 0x02;
        id->ieee[2] = 0x00;
    } else {
        id->ieee[0] = 0x00;
        id->ieee[1] = 0x54;
        id->ieee[2] = 0x52;
    }

    id->mdts = n->params.mdts;
    id->ver = cpu_to_le32(NVME_SPEC_VER);

    oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DIRECTIVES |
        NVME_OACS_SECURITY;

    if (n->params.dbcs) {
        oacs |= NVME_OACS_DBCS;

        n->cse.acs[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP;
    }

    if (n->params.sriov_max_vfs) {
        oacs |= NVME_OACS_VMS;

        n->cse.acs[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP;
    }

    id->oacs = cpu_to_le16(oacs);

    id->cntrltype = 0x1;

    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Note that there can easily be
     * many Abort commands in the queues, but they are not considered
     * "executing" until processed by nvme_abort.
     *
     * The specification recommends a value of 3 for Abort Command Limit (four
     * concurrently outstanding Abort commands), so let's use that though it is
     * inconsequential.
     */
    id->acl = 3;
    id->aerl = n->params.aerl;
    id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;

    /* recommended default value (~70 C) */
    id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
    id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);

    id->sqes = (NVME_SQES << 4) | NVME_SQES;
    id->cqes = (NVME_CQES << 4) | NVME_CQES;
    id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                           NVME_ONCS_COMPARE | NVME_ONCS_COPY |
                           NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);

    /*
     * NOTE: If this device ever supports a command set that does NOT use 0x0
     * as a Flush-equivalent operation, support for the broadcast NSID in Flush
     * should probably be removed.
     *
     * See comment in nvme_io_cmd.
     */
    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;

    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
                           NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                           NVME_CTRL_SGLS_MPTR_SGL);

    nvme_init_subnqn(n);

    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    NVME_CAP_SET_MQES(cap, n->params.mqes);
    NVME_CAP_SET_CQR(cap, 1);
    NVME_CAP_SET_TO(cap, 0xf);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NCSS);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_IOCSS);
    NVME_CAP_SET_MPSMAX(cap, 4);
    NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
    NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
    stq_le_p(&n->bar.cap, cap);

    stl_le_p(&n->bar.vs, NVME_SPEC_VER);
    n->bar.intmc = n->bar.intms = 0;

    if (pci_is_vf(pci_dev) && !sctrl->scs) {
        stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
    }
}

static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
{
    int cntlid;

    if (!n->subsys) {
        DeviceState *dev = qdev_new(TYPE_NVME_SUBSYS);

        qdev_prop_set_string(dev, "nqn", n->params.serial);

        if (!qdev_realize(dev, NULL, errp)) {
            return -1;
        }

        n->subsys = NVME_SUBSYS(dev);
    } else {
        NvmeIdCtrl *id = &n->id_ctrl;
        uint32_t ctratt = le32_to_cpu(id->ctratt);

        id->cmic |= NVME_CMIC_MULTI_CTRL;
        ctratt |= NVME_CTRATT_ENDGRPS;

        id->endgidmax = cpu_to_le16(0x1);

        if (n->subsys->endgrp.fdp.enabled) {
            ctratt |= NVME_CTRATT_FDPS;
        }

        id->ctratt = cpu_to_le32(ctratt);
    }

    cntlid = nvme_subsys_register_ctrl(n, errp);
    if (cntlid < 0) {
        return -1;
    }

    n->cntlid = cntlid;

    return 0;
}

void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t nsid = ns->params.nsid;
    assert(nsid && nsid <= NVME_MAX_NAMESPACES);

    n->namespaces[nsid] = ns;
    ns->attached++;
}
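
/*
 * Device realize handler. Virtual functions first inherit their parameters
 * from the physical function; the configuration is then validated, the
 * subsystem, controller state, PCI resources and identify data are set up,
 * and a legacy 'drive'-backed namespace is attached if one was given.
 */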
static void nvme_realize(PCIDevice *pci_dev, Error **errp)
{
    NvmeCtrl *n = NVME(pci_dev);
    DeviceState *dev = DEVICE(pci_dev);
    NvmeNamespace *ns;
    NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));

    if (pci_is_vf(pci_dev)) {
        /*
         * VFs derive settings from the parent. PF's lifespan exceeds
         * that of VF's.
         */
        memcpy(&n->params, &pn->params, sizeof(NvmeParams));

        /*
         * Duplicate the PF's serial string so that releasing the VF's
         * 'serial' property does not free the PF's copy when a VF is
         * removed from the system.
         */
        n->params.serial = g_strdup(pn->params.serial);
        n->subsys = pn->subsys;

        /*
         * Assigning this link (strong link) causes an `object_unref` later in
         * `object_release_link_property`. Increment the refcount to balance
         * this out.
         */
        object_ref(OBJECT(pn->subsys));
    }

    if (!nvme_check_params(n, errp)) {
        return;
    }

    qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);

    if (nvme_init_subsys(n, errp)) {
        return;
    }
    nvme_init_state(n);
    if (!nvme_init_pci(n, pci_dev, errp)) {
        return;
    }
    nvme_init_ctrl(n, pci_dev);

    /* setup a namespace if the controller drive property was given */
    if (n->namespace.blkconf.blk) {
        ns = &n->namespace;
        ns->params.nsid = 1;
        ns->ctrl = n;

        if (nvme_ns_setup(ns, errp)) {
            return;
        }

        n->subsys->namespaces[ns->params.nsid] = ns;
    }
}

static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeNamespace *ns;
    int i;

    nvme_ctrl_reset(n, NVME_RESET_FUNCTION);

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (ns) {
            ns->attached--;
        }
    }

    nvme_subsys_unregister_ctrl(n->subsys, n);

    g_free(n->cq);
    g_free(n->sq);
    g_free(n->aer_reqs);

    if (n->params.cmb_size_mb) {
        g_free(n->cmb.buf);
    }

    /* Only one of the `spdm_socket`s below should have been setup */
    assert(!(pci_dev->doe_spdm.spdm_socket > 0 && n->spdm_socket >= 0));
    if (pci_dev->doe_spdm.spdm_socket > 0) {
        spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
                          SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
    } else if (n->spdm_socket >= 0) {
        spdm_socket_close(n->spdm_socket,
                          SPDM_SOCKET_TRANSPORT_TYPE_NVME);
    }

    if (n->pmr.dev) {
        host_memory_backend_set_mapped(n->pmr.dev, false);
    }

    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
        pcie_sriov_pf_exit(pci_dev);
    }

    if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
        msix_uninit_exclusive_bar(pci_dev);
    } else {
        msix_uninit(pci_dev, &n->bar0, &n->bar0);
    }

    memory_region_del_subregion(&n->bar0, &n->iomem);
}

static const Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
                     NvmeSubsystem *),
    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
    DEFINE_PROP_BOOL("dbcs", NvmeCtrl, params.dbcs, true),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                     params.auto_transition_zones, true),
    DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
    DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
                       params.sriov_vq_flexible, 0),
    DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
                       params.sriov_vi_flexible, 0),
    DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
                       params.sriov_max_vi_per_vf, 0),
    DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
                       params.sriov_max_vq_per_vf, 0),
    DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
                     false),
    DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
    DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
    DEFINE_PROP_SPDM_TRANS("spdm_trans", PCIDevice, spdm_trans,
                           SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE),
    DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
    DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
    DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
    DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
    DEFINE_PROP_BOOL("ocp", NvmeCtrl, params.ocp, false),
};

static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value = n->smart_critical_warning;

    visit_type_uint8(v, name, &value, errp);
}

static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value, old_value, cap = 0, index, event;

    if (!visit_type_uint8(v, name, &value, errp)) {
        return;
    }

    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
    if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
        cap |= NVME_SMART_PMR_UNRELIABLE;
    }

    if ((value & cap) != value) {
        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
                   value & ~cap);
        return;
    }

    old_value = n->smart_critical_warning;
    n->smart_critical_warning = value;

    /* only inject new bits of smart critical warning */
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        event = 1 << index;
        if (value & ~old_value & event)
            nvme_smart_event(n, event);
    }
}

static void nvme_pci_reset(DeviceState *qdev)
{
    PCIDevice *pci_dev = PCI_DEVICE(qdev);
    NvmeCtrl *n = NVME(pci_dev);

    trace_pci_nvme_pci_reset();
    nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
}

static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
{
    NvmeCtrl *n = NVME(dev);
    NvmeSecCtrlEntry *sctrl;
    int i;

    for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
        sctrl = &n->sec_ctrl_list[i];
        nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
    }
}

static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
                                  uint32_t val, int len)
{
    uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);

    /* DOE is only initialised if SPDM over DOE is used */
    if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE) &&
        dev->spdm_trans == SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE) {
        pcie_doe_write_config(&dev->doe_spdm, address, val, len);
    }
    pci_default_write_config(dev, address, val, len);
    pcie_cap_flr_write_config(dev, address, val, len);
    nvme_sriov_post_write_config(dev, old_num_vfs);
}

static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
{
    uint32_t val;

    if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE) &&
        (dev->spdm_trans == SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE)) {
        if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
            return val;
        }
    }
    return pci_default_read_config(dev, address, len);
}

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, const void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->realize = nvme_realize;
    pc->config_write = nvme_pci_write_config;
    pc->config_read = nvme_pci_read_config;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->revision = 2;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    device_class_set_props(dc, nvme_props);
    dc->vmsd = &nvme_vmstate;
    device_class_set_legacy_reset(dc, nvme_pci_reset);
}

static void nvme_instance_init(Object *obj)
{
    NvmeCtrl *n = NVME(obj);

    device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
                                  "bootindex", "/namespace@1,0",
                                  DEVICE(obj));

    object_property_add(obj, "smart_critical_warning", "uint8",
                        nvme_get_smart_warning,
                        nvme_set_smart_warning, NULL, NULL);
}

static const TypeInfo nvme_info = {
    .name = TYPE_NVME,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .instance_init = nvme_instance_init,
    .class_init = nvme_class_init,
    .interfaces = (const InterfaceInfo[]) {
        { INTERFACE_PCIE_DEVICE },
        { }
    },
};

static const TypeInfo nvme_bus_info = {
    .name = TYPE_NVME_BUS,
    .parent = TYPE_BUS,
    .instance_size = sizeof(NvmeBus),
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
    type_register_static(&nvme_bus_info);
}

type_init(nvme_register_types)