/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 *   https://nvmexpress.org/developers/nvme-specification/
 *
 *
 * Notes on coding style
 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in the comments
 * (i.e. 'h' suffix instead of '0x' prefix).
 *
 * Usage
 * -----
 * See docs/system/nvme.rst for extensive documentation.
 *
 * Add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
 *      -device nvme,serial=<serial>,id=<bus_name>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>, \
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
 *              mdts=<N[optional]>,vsl=<N[optional]>, \
 *              zoned.zasl=<N[optional]>, \
 *              zoned.auto_transition=<on|off[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>, \
 *              zoned=<true|false[optional]>, \
 *              subsys=<subsys_id>,detached=<true|false[optional]>
 *
 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
 * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
 * default, the device will use the "v1.4 CMB scheme" - use the `legacy-cmb`
 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
 *
 * Enabling PMR emulation can be achieved by pointing to a memory-backend-file.
 * For example:
 *      -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 *       size=<size> .... -device nvme,...,pmrdev=<mem_id>
 *
 * The PMR will use BAR 4/5 exclusively.
 *
 * To place controller(s) and namespace(s) in a subsystem, provide an
 * nvme-subsys device as shown above.
 *
 * nvme subsystem device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `nqn`
 *   This parameter provides the `<nqn_id>` part of the string
 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
 *   subsystem, but this is not enforced by QEMU. If not specified, it will
 *   default to the value of the `id` parameter (`<subsys_id>`).
 *
 * nvme device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`
 *   Specifying this parameter attaches the controller to the subsystem and
 *   the SUBNQN field in the controller will report the NQN of the subsystem
 *   device. This also enables the multi controller capability represented in
 *   the Identify Controller data structure in CMIC (Controller Multi-path I/O
 *   and Namespace Sharing Capabilities).
 *
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
 *
 * - `mdts`
 *   Indicates the maximum data transfer size for a command that transfers data
 *   between host-accessible memory and the controller. The value is specified
 *   as a power of two (2^n) and is in units of the minimum memory page size
 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
 *   KiB).
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * - `zoned.auto_transition`
 *   Indicates if zones in zone state implicitly opened can be automatically
 *   transitioned to zone state closed for resource management purposes.
 *   Defaults to 'on'.
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `shared`
 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 *   or implicitly by the most recently defined NvmeBus) is linked to an
 *   nvme-subsys device, the namespace will be attached to all controllers in
 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 *   private namespace and may only be attached to a single controller at a
 *   time.
 *
 * - `detached`
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
 *   namespace will be available in the subsystem but not attached to any
 *   controllers.
 *
 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
 * In this case, the following namespace properties are available to configure
 * zoned operation:
 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 *
 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 *         The value 0 (default) forces zone capacity to be the same as zone
 *         size. The value of this property may not exceed zone size.
 *
 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
 *         This value needs to be specified in 64B units. If it is zero,
 *         namespace(s) will not support zone descriptor extensions.
 *
 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently active zones.
 *
 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently open zones.
 *
 *     zoned.cross_read=<enable RAZB, default: false>
 *         Setting this property to true enables Read Across Zone Boundaries.
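 *
 * As an illustration of the zoned parameters above, a zoned namespace could
 * be configured along the following lines (the ids are placeholders and the
 * sizes and limits are arbitrary example values, not defaults):
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme,serial=<serial>,id=<bus_name>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=1, \
 *              zoned=true,zoned.zone_size=64M,zoned.zone_capacity=60M, \
 *              zoned.max_open=16,zoned.max_active=32,zoned.cross_read=true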
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION]              = true,
    [NVME_POWER_MANAGEMENT]         = true,
    [NVME_TEMPERATURE_THRESHOLD]    = true,
    [NVME_ERROR_RECOVERY]           = true,
    [NVME_VOLATILE_WRITE_CACHE]     = true,
    [NVME_NUMBER_OF_QUEUES]         = true,
    [NVME_INTERRUPT_COALESCING]     = true,
    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    [NVME_WRITE_ATOMICITY]          = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    [NVME_TIMESTAMP]                = true,
    [NVME_COMMAND_SET_PROFILE]      = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 =
NVME_CMD_EFF_CSUPP, 244 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 245 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, 246 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 247 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, 248 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 249 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 250 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, 251 }; 252 253 static void nvme_process_sq(void *opaque); 254 255 static uint16_t nvme_sqid(NvmeRequest *req) 256 { 257 return le16_to_cpu(req->sq->sqid); 258 } 259 260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, 261 NvmeZoneState state) 262 { 263 if (QTAILQ_IN_USE(zone, entry)) { 264 switch (nvme_get_zone_state(zone)) { 265 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 266 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); 267 break; 268 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 269 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 270 break; 271 case NVME_ZONE_STATE_CLOSED: 272 QTAILQ_REMOVE(&ns->closed_zones, zone, entry); 273 break; 274 case NVME_ZONE_STATE_FULL: 275 QTAILQ_REMOVE(&ns->full_zones, zone, entry); 276 default: 277 ; 278 } 279 } 280 281 nvme_set_zone_state(zone, state); 282 283 switch (state) { 284 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 285 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry); 286 break; 287 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 288 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry); 289 break; 290 case NVME_ZONE_STATE_CLOSED: 291 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry); 292 break; 293 case NVME_ZONE_STATE_FULL: 294 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry); 295 case NVME_ZONE_STATE_READ_ONLY: 296 break; 297 default: 298 zone->d.za = 0; 299 } 300 } 301 302 /* 303 * Check if we can open a zone without exceeding open/active limits. 304 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). 305 */ 306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) 307 { 308 if (ns->params.max_active_zones != 0 && 309 ns->nr_active_zones + act > ns->params.max_active_zones) { 310 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones); 311 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; 312 } 313 if (ns->params.max_open_zones != 0 && 314 ns->nr_open_zones + opn > ns->params.max_open_zones) { 315 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones); 316 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; 317 } 318 319 return NVME_SUCCESS; 320 } 321 322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) 323 { 324 hwaddr hi, lo; 325 326 if (!n->cmb.cmse) { 327 return false; 328 } 329 330 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba; 331 hi = lo + int128_get64(n->cmb.mem.size); 332 333 return addr >= lo && addr < hi; 334 } 335 336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) 337 { 338 hwaddr base = n->params.legacy_cmb ? 
n->cmb.mem.addr : n->cmb.cba; 339 return &n->cmb.buf[addr - base]; 340 } 341 342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) 343 { 344 hwaddr hi; 345 346 if (!n->pmr.cmse) { 347 return false; 348 } 349 350 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size); 351 352 return addr >= n->pmr.cba && addr < hi; 353 } 354 355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) 356 { 357 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); 358 } 359 360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) 361 { 362 hwaddr hi = addr + size - 1; 363 if (hi < addr) { 364 return 1; 365 } 366 367 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { 368 memcpy(buf, nvme_addr_to_cmb(n, addr), size); 369 return 0; 370 } 371 372 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { 373 memcpy(buf, nvme_addr_to_pmr(n, addr), size); 374 return 0; 375 } 376 377 return pci_dma_read(&n->parent_obj, addr, buf, size); 378 } 379 380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size) 381 { 382 hwaddr hi = addr + size - 1; 383 if (hi < addr) { 384 return 1; 385 } 386 387 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { 388 memcpy(nvme_addr_to_cmb(n, addr), buf, size); 389 return 0; 390 } 391 392 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { 393 memcpy(nvme_addr_to_pmr(n, addr), buf, size); 394 return 0; 395 } 396 397 return pci_dma_write(&n->parent_obj, addr, buf, size); 398 } 399 400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid) 401 { 402 return nsid && 403 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES); 404 } 405 406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid) 407 { 408 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1; 409 } 410 411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid) 412 { 413 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 
0 : -1; 414 } 415 416 static void nvme_inc_cq_tail(NvmeCQueue *cq) 417 { 418 cq->tail++; 419 if (cq->tail >= cq->size) { 420 cq->tail = 0; 421 cq->phase = !cq->phase; 422 } 423 } 424 425 static void nvme_inc_sq_head(NvmeSQueue *sq) 426 { 427 sq->head = (sq->head + 1) % sq->size; 428 } 429 430 static uint8_t nvme_cq_full(NvmeCQueue *cq) 431 { 432 return (cq->tail + 1) % cq->size == cq->head; 433 } 434 435 static uint8_t nvme_sq_empty(NvmeSQueue *sq) 436 { 437 return sq->head == sq->tail; 438 } 439 440 static void nvme_irq_check(NvmeCtrl *n) 441 { 442 if (msix_enabled(&(n->parent_obj))) { 443 return; 444 } 445 if (~n->bar.intms & n->irq_status) { 446 pci_irq_assert(&n->parent_obj); 447 } else { 448 pci_irq_deassert(&n->parent_obj); 449 } 450 } 451 452 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) 453 { 454 if (cq->irq_enabled) { 455 if (msix_enabled(&(n->parent_obj))) { 456 trace_pci_nvme_irq_msix(cq->vector); 457 msix_notify(&(n->parent_obj), cq->vector); 458 } else { 459 trace_pci_nvme_irq_pin(); 460 assert(cq->vector < 32); 461 n->irq_status |= 1 << cq->vector; 462 nvme_irq_check(n); 463 } 464 } else { 465 trace_pci_nvme_irq_masked(); 466 } 467 } 468 469 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) 470 { 471 if (cq->irq_enabled) { 472 if (msix_enabled(&(n->parent_obj))) { 473 return; 474 } else { 475 assert(cq->vector < 32); 476 n->irq_status &= ~(1 << cq->vector); 477 nvme_irq_check(n); 478 } 479 } 480 } 481 482 static void nvme_req_clear(NvmeRequest *req) 483 { 484 req->ns = NULL; 485 req->opaque = NULL; 486 req->aiocb = NULL; 487 memset(&req->cqe, 0x0, sizeof(req->cqe)); 488 req->status = NVME_SUCCESS; 489 } 490 491 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma) 492 { 493 if (dma) { 494 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0); 495 sg->flags = NVME_SG_DMA; 496 } else { 497 qemu_iovec_init(&sg->iov, 0); 498 } 499 500 sg->flags |= NVME_SG_ALLOC; 501 } 502 503 static inline void nvme_sg_unmap(NvmeSg *sg) 504 { 505 if (!(sg->flags & NVME_SG_ALLOC)) { 506 return; 507 } 508 509 if (sg->flags & NVME_SG_DMA) { 510 qemu_sglist_destroy(&sg->qsg); 511 } else { 512 qemu_iovec_destroy(&sg->iov); 513 } 514 515 memset(sg, 0x0, sizeof(*sg)); 516 } 517 518 /* 519 * When metadata is transfered as extended LBAs, the DPTR mapped into `sg` 520 * holds both data and metadata. This function splits the data and metadata 521 * into two separate QSG/IOVs. 522 */ 523 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data, 524 NvmeSg *mdata) 525 { 526 NvmeSg *dst = data; 527 uint32_t trans_len, count = ns->lbasz; 528 uint64_t offset = 0; 529 bool dma = sg->flags & NVME_SG_DMA; 530 size_t sge_len; 531 size_t sg_len = dma ? sg->qsg.size : sg->iov.size; 532 int sg_idx = 0; 533 534 assert(sg->flags & NVME_SG_ALLOC); 535 536 while (sg_len) { 537 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len; 538 539 trans_len = MIN(sg_len, count); 540 trans_len = MIN(trans_len, sge_len - offset); 541 542 if (dst) { 543 if (dma) { 544 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset, 545 trans_len); 546 } else { 547 qemu_iovec_add(&dst->iov, 548 sg->iov.iov[sg_idx].iov_base + offset, 549 trans_len); 550 } 551 } 552 553 sg_len -= trans_len; 554 count -= trans_len; 555 offset += trans_len; 556 557 if (count == 0) { 558 dst = (dst == data) ? mdata : data; 559 count = (dst == data) ? 
ns->lbasz : ns->lbaf.ms; 560 } 561 562 if (sge_len == offset) { 563 offset = 0; 564 sg_idx++; 565 } 566 } 567 } 568 569 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, 570 size_t len) 571 { 572 if (!len) { 573 return NVME_SUCCESS; 574 } 575 576 trace_pci_nvme_map_addr_cmb(addr, len); 577 578 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) { 579 return NVME_DATA_TRAS_ERROR; 580 } 581 582 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len); 583 584 return NVME_SUCCESS; 585 } 586 587 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, 588 size_t len) 589 { 590 if (!len) { 591 return NVME_SUCCESS; 592 } 593 594 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) { 595 return NVME_DATA_TRAS_ERROR; 596 } 597 598 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len); 599 600 return NVME_SUCCESS; 601 } 602 603 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len) 604 { 605 bool cmb = false, pmr = false; 606 607 if (!len) { 608 return NVME_SUCCESS; 609 } 610 611 trace_pci_nvme_map_addr(addr, len); 612 613 if (nvme_addr_is_cmb(n, addr)) { 614 cmb = true; 615 } else if (nvme_addr_is_pmr(n, addr)) { 616 pmr = true; 617 } 618 619 if (cmb || pmr) { 620 if (sg->flags & NVME_SG_DMA) { 621 return NVME_INVALID_USE_OF_CMB | NVME_DNR; 622 } 623 624 if (cmb) { 625 return nvme_map_addr_cmb(n, &sg->iov, addr, len); 626 } else { 627 return nvme_map_addr_pmr(n, &sg->iov, addr, len); 628 } 629 } 630 631 if (!(sg->flags & NVME_SG_DMA)) { 632 return NVME_INVALID_USE_OF_CMB | NVME_DNR; 633 } 634 635 qemu_sglist_add(&sg->qsg, addr, len); 636 637 return NVME_SUCCESS; 638 } 639 640 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr) 641 { 642 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr)); 643 } 644 645 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, 646 uint64_t prp2, uint32_t len) 647 { 648 hwaddr trans_len = n->page_size - (prp1 % n->page_size); 649 trans_len = MIN(len, trans_len); 650 int num_prps = (len >> n->page_bits) + 1; 651 uint16_t status; 652 int ret; 653 654 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); 655 656 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1)); 657 658 status = nvme_map_addr(n, sg, prp1, trans_len); 659 if (status) { 660 goto unmap; 661 } 662 663 len -= trans_len; 664 if (len) { 665 if (len > n->page_size) { 666 uint64_t prp_list[n->max_prp_ents]; 667 uint32_t nents, prp_trans; 668 int i = 0; 669 670 /* 671 * The first PRP list entry, pointed to by PRP2 may contain offset. 672 * Hence, we need to calculate the number of entries in based on 673 * that offset. 
674 */ 675 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3; 676 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); 677 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); 678 if (ret) { 679 trace_pci_nvme_err_addr_read(prp2); 680 status = NVME_DATA_TRAS_ERROR; 681 goto unmap; 682 } 683 while (len != 0) { 684 uint64_t prp_ent = le64_to_cpu(prp_list[i]); 685 686 if (i == nents - 1 && len > n->page_size) { 687 if (unlikely(prp_ent & (n->page_size - 1))) { 688 trace_pci_nvme_err_invalid_prplist_ent(prp_ent); 689 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 690 goto unmap; 691 } 692 693 i = 0; 694 nents = (len + n->page_size - 1) >> n->page_bits; 695 nents = MIN(nents, n->max_prp_ents); 696 prp_trans = nents * sizeof(uint64_t); 697 ret = nvme_addr_read(n, prp_ent, (void *)prp_list, 698 prp_trans); 699 if (ret) { 700 trace_pci_nvme_err_addr_read(prp_ent); 701 status = NVME_DATA_TRAS_ERROR; 702 goto unmap; 703 } 704 prp_ent = le64_to_cpu(prp_list[i]); 705 } 706 707 if (unlikely(prp_ent & (n->page_size - 1))) { 708 trace_pci_nvme_err_invalid_prplist_ent(prp_ent); 709 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 710 goto unmap; 711 } 712 713 trans_len = MIN(len, n->page_size); 714 status = nvme_map_addr(n, sg, prp_ent, trans_len); 715 if (status) { 716 goto unmap; 717 } 718 719 len -= trans_len; 720 i++; 721 } 722 } else { 723 if (unlikely(prp2 & (n->page_size - 1))) { 724 trace_pci_nvme_err_invalid_prp2_align(prp2); 725 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 726 goto unmap; 727 } 728 status = nvme_map_addr(n, sg, prp2, len); 729 if (status) { 730 goto unmap; 731 } 732 } 733 } 734 735 return NVME_SUCCESS; 736 737 unmap: 738 nvme_sg_unmap(sg); 739 return status; 740 } 741 742 /* 743 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the 744 * number of bytes mapped in len. 745 */ 746 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, 747 NvmeSglDescriptor *segment, uint64_t nsgld, 748 size_t *len, NvmeCmd *cmd) 749 { 750 dma_addr_t addr, trans_len; 751 uint32_t dlen; 752 uint16_t status; 753 754 for (int i = 0; i < nsgld; i++) { 755 uint8_t type = NVME_SGL_TYPE(segment[i].type); 756 757 switch (type) { 758 case NVME_SGL_DESCR_TYPE_BIT_BUCKET: 759 if (cmd->opcode == NVME_CMD_WRITE) { 760 continue; 761 } 762 case NVME_SGL_DESCR_TYPE_DATA_BLOCK: 763 break; 764 case NVME_SGL_DESCR_TYPE_SEGMENT: 765 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: 766 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR; 767 default: 768 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR; 769 } 770 771 dlen = le32_to_cpu(segment[i].len); 772 773 if (!dlen) { 774 continue; 775 } 776 777 if (*len == 0) { 778 /* 779 * All data has been mapped, but the SGL contains additional 780 * segments and/or descriptors. The controller might accept 781 * ignoring the rest of the SGL. 
782 */ 783 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls); 784 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) { 785 break; 786 } 787 788 trace_pci_nvme_err_invalid_sgl_excess_length(dlen); 789 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 790 } 791 792 trans_len = MIN(*len, dlen); 793 794 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) { 795 goto next; 796 } 797 798 addr = le64_to_cpu(segment[i].addr); 799 800 if (UINT64_MAX - addr < dlen) { 801 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 802 } 803 804 status = nvme_map_addr(n, sg, addr, trans_len); 805 if (status) { 806 return status; 807 } 808 809 next: 810 *len -= trans_len; 811 } 812 813 return NVME_SUCCESS; 814 } 815 816 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, 817 size_t len, NvmeCmd *cmd) 818 { 819 /* 820 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid 821 * dynamically allocating a potentially huge SGL. The spec allows the SGL 822 * to be larger (as in number of bytes required to describe the SGL 823 * descriptors and segment chain) than the command transfer size, so it is 824 * not bounded by MDTS. 825 */ 826 const int SEG_CHUNK_SIZE = 256; 827 828 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; 829 uint64_t nsgld; 830 uint32_t seg_len; 831 uint16_t status; 832 hwaddr addr; 833 int ret; 834 835 sgld = &sgl; 836 addr = le64_to_cpu(sgl.addr); 837 838 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len); 839 840 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr)); 841 842 /* 843 * If the entire transfer can be described with a single data block it can 844 * be mapped directly. 845 */ 846 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { 847 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd); 848 if (status) { 849 goto unmap; 850 } 851 852 goto out; 853 } 854 855 for (;;) { 856 switch (NVME_SGL_TYPE(sgld->type)) { 857 case NVME_SGL_DESCR_TYPE_SEGMENT: 858 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: 859 break; 860 default: 861 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 862 } 863 864 seg_len = le32_to_cpu(sgld->len); 865 866 /* check the length of the (Last) Segment descriptor */ 867 if ((!seg_len || seg_len & 0xf) && 868 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) { 869 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 870 } 871 872 if (UINT64_MAX - addr < seg_len) { 873 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 874 } 875 876 nsgld = seg_len / sizeof(NvmeSglDescriptor); 877 878 while (nsgld > SEG_CHUNK_SIZE) { 879 if (nvme_addr_read(n, addr, segment, sizeof(segment))) { 880 trace_pci_nvme_err_addr_read(addr); 881 status = NVME_DATA_TRAS_ERROR; 882 goto unmap; 883 } 884 885 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE, 886 &len, cmd); 887 if (status) { 888 goto unmap; 889 } 890 891 nsgld -= SEG_CHUNK_SIZE; 892 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor); 893 } 894 895 ret = nvme_addr_read(n, addr, segment, nsgld * 896 sizeof(NvmeSglDescriptor)); 897 if (ret) { 898 trace_pci_nvme_err_addr_read(addr); 899 status = NVME_DATA_TRAS_ERROR; 900 goto unmap; 901 } 902 903 last_sgld = &segment[nsgld - 1]; 904 905 /* 906 * If the segment ends with a Data Block or Bit Bucket Descriptor Type, 907 * then we are done. 
908 */ 909 switch (NVME_SGL_TYPE(last_sgld->type)) { 910 case NVME_SGL_DESCR_TYPE_DATA_BLOCK: 911 case NVME_SGL_DESCR_TYPE_BIT_BUCKET: 912 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd); 913 if (status) { 914 goto unmap; 915 } 916 917 goto out; 918 919 default: 920 break; 921 } 922 923 /* 924 * If the last descriptor was not a Data Block or Bit Bucket, then the 925 * current segment must not be a Last Segment. 926 */ 927 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) { 928 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 929 goto unmap; 930 } 931 932 sgld = last_sgld; 933 addr = le64_to_cpu(sgld->addr); 934 935 /* 936 * Do not map the last descriptor; it will be a Segment or Last Segment 937 * descriptor and is handled by the next iteration. 938 */ 939 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd); 940 if (status) { 941 goto unmap; 942 } 943 } 944 945 out: 946 /* if there is any residual left in len, the SGL was too short */ 947 if (len) { 948 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 949 goto unmap; 950 } 951 952 return NVME_SUCCESS; 953 954 unmap: 955 nvme_sg_unmap(sg); 956 return status; 957 } 958 959 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, 960 NvmeCmd *cmd) 961 { 962 uint64_t prp1, prp2; 963 964 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) { 965 case NVME_PSDT_PRP: 966 prp1 = le64_to_cpu(cmd->dptr.prp1); 967 prp2 = le64_to_cpu(cmd->dptr.prp2); 968 969 return nvme_map_prp(n, sg, prp1, prp2, len); 970 case NVME_PSDT_SGL_MPTR_CONTIGUOUS: 971 case NVME_PSDT_SGL_MPTR_SGL: 972 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd); 973 default: 974 return NVME_INVALID_FIELD; 975 } 976 } 977 978 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len, 979 NvmeCmd *cmd) 980 { 981 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags); 982 hwaddr mptr = le64_to_cpu(cmd->mptr); 983 uint16_t status; 984 985 if (psdt == NVME_PSDT_SGL_MPTR_SGL) { 986 NvmeSglDescriptor sgl; 987 988 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) { 989 return NVME_DATA_TRAS_ERROR; 990 } 991 992 status = nvme_map_sgl(n, sg, sgl, len, cmd); 993 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) { 994 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR; 995 } 996 997 return status; 998 } 999 1000 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr)); 1001 status = nvme_map_addr(n, sg, mptr, len); 1002 if (status) { 1003 nvme_sg_unmap(sg); 1004 } 1005 1006 return status; 1007 } 1008 1009 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) 1010 { 1011 NvmeNamespace *ns = req->ns; 1012 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1013 uint16_t ctrl = le16_to_cpu(rw->control); 1014 size_t len = nvme_l2b(ns, nlb); 1015 uint16_t status; 1016 1017 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && 1018 (ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) { 1019 goto out; 1020 } 1021 1022 if (nvme_ns_ext(ns)) { 1023 NvmeSg sg; 1024 1025 len += nvme_m2b(ns, nlb); 1026 1027 status = nvme_map_dptr(n, &sg, len, &req->cmd); 1028 if (status) { 1029 return status; 1030 } 1031 1032 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA); 1033 nvme_sg_split(&sg, ns, &req->sg, NULL); 1034 nvme_sg_unmap(&sg); 1035 1036 return NVME_SUCCESS; 1037 } 1038 1039 out: 1040 return nvme_map_dptr(n, &req->sg, len, &req->cmd); 1041 } 1042 1043 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) 1044 { 1045 NvmeNamespace *ns = req->ns; 1046 size_t len = nvme_m2b(ns, nlb); 1047 uint16_t status; 1048 1049 if (nvme_ns_ext(ns)) { 1050 NvmeSg sg; 1051 1052 len += 
nvme_l2b(ns, nlb); 1053 1054 status = nvme_map_dptr(n, &sg, len, &req->cmd); 1055 if (status) { 1056 return status; 1057 } 1058 1059 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA); 1060 nvme_sg_split(&sg, ns, NULL, &req->sg); 1061 nvme_sg_unmap(&sg); 1062 1063 return NVME_SUCCESS; 1064 } 1065 1066 return nvme_map_mptr(n, &req->sg, len, &req->cmd); 1067 } 1068 1069 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, 1070 uint32_t len, uint32_t bytes, 1071 int32_t skip_bytes, int64_t offset, 1072 NvmeTxDirection dir) 1073 { 1074 hwaddr addr; 1075 uint32_t trans_len, count = bytes; 1076 bool dma = sg->flags & NVME_SG_DMA; 1077 int64_t sge_len; 1078 int sg_idx = 0; 1079 int ret; 1080 1081 assert(sg->flags & NVME_SG_ALLOC); 1082 1083 while (len) { 1084 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len; 1085 1086 if (sge_len - offset < 0) { 1087 offset -= sge_len; 1088 sg_idx++; 1089 continue; 1090 } 1091 1092 if (sge_len == offset) { 1093 offset = 0; 1094 sg_idx++; 1095 continue; 1096 } 1097 1098 trans_len = MIN(len, count); 1099 trans_len = MIN(trans_len, sge_len - offset); 1100 1101 if (dma) { 1102 addr = sg->qsg.sg[sg_idx].base + offset; 1103 } else { 1104 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset; 1105 } 1106 1107 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1108 ret = nvme_addr_read(n, addr, ptr, trans_len); 1109 } else { 1110 ret = nvme_addr_write(n, addr, ptr, trans_len); 1111 } 1112 1113 if (ret) { 1114 return NVME_DATA_TRAS_ERROR; 1115 } 1116 1117 ptr += trans_len; 1118 len -= trans_len; 1119 count -= trans_len; 1120 offset += trans_len; 1121 1122 if (count == 0) { 1123 count = bytes; 1124 offset += skip_bytes; 1125 } 1126 } 1127 1128 return NVME_SUCCESS; 1129 } 1130 1131 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, 1132 NvmeTxDirection dir) 1133 { 1134 assert(sg->flags & NVME_SG_ALLOC); 1135 1136 if (sg->flags & NVME_SG_DMA) { 1137 uint64_t residual; 1138 1139 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1140 residual = dma_buf_write(ptr, len, &sg->qsg); 1141 } else { 1142 residual = dma_buf_read(ptr, len, &sg->qsg); 1143 } 1144 1145 if (unlikely(residual)) { 1146 trace_pci_nvme_err_invalid_dma(); 1147 return NVME_INVALID_FIELD | NVME_DNR; 1148 } 1149 } else { 1150 size_t bytes; 1151 1152 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1153 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len); 1154 } else { 1155 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len); 1156 } 1157 1158 if (unlikely(bytes != len)) { 1159 trace_pci_nvme_err_invalid_dma(); 1160 return NVME_INVALID_FIELD | NVME_DNR; 1161 } 1162 } 1163 1164 return NVME_SUCCESS; 1165 } 1166 1167 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1168 NvmeRequest *req) 1169 { 1170 uint16_t status; 1171 1172 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 1173 if (status) { 1174 return status; 1175 } 1176 1177 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE); 1178 } 1179 1180 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1181 NvmeRequest *req) 1182 { 1183 uint16_t status; 1184 1185 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 1186 if (status) { 1187 return status; 1188 } 1189 1190 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE); 1191 } 1192 1193 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1194 NvmeTxDirection dir, NvmeRequest *req) 1195 { 1196 NvmeNamespace *ns = req->ns; 1197 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1198 
uint16_t ctrl = le16_to_cpu(rw->control); 1199 1200 if (nvme_ns_ext(ns) && 1201 !(ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) { 1202 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz, 1203 ns->lbaf.ms, 0, dir); 1204 } 1205 1206 return nvme_tx(n, &req->sg, ptr, len, dir); 1207 } 1208 1209 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1210 NvmeTxDirection dir, NvmeRequest *req) 1211 { 1212 NvmeNamespace *ns = req->ns; 1213 uint16_t status; 1214 1215 if (nvme_ns_ext(ns)) { 1216 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms, 1217 ns->lbasz, ns->lbasz, dir); 1218 } 1219 1220 nvme_sg_unmap(&req->sg); 1221 1222 status = nvme_map_mptr(n, &req->sg, len, &req->cmd); 1223 if (status) { 1224 return status; 1225 } 1226 1227 return nvme_tx(n, &req->sg, ptr, len, dir); 1228 } 1229 1230 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, 1231 BlockCompletionFunc *cb, NvmeRequest *req) 1232 { 1233 assert(req->sg.flags & NVME_SG_ALLOC); 1234 1235 if (req->sg.flags & NVME_SG_DMA) { 1236 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, 1237 cb, req); 1238 } else { 1239 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req); 1240 } 1241 } 1242 1243 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, 1244 BlockCompletionFunc *cb, NvmeRequest *req) 1245 { 1246 assert(req->sg.flags & NVME_SG_ALLOC); 1247 1248 if (req->sg.flags & NVME_SG_DMA) { 1249 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, 1250 cb, req); 1251 } else { 1252 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req); 1253 } 1254 } 1255 1256 static void nvme_post_cqes(void *opaque) 1257 { 1258 NvmeCQueue *cq = opaque; 1259 NvmeCtrl *n = cq->ctrl; 1260 NvmeRequest *req, *next; 1261 int ret; 1262 1263 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) { 1264 NvmeSQueue *sq; 1265 hwaddr addr; 1266 1267 if (nvme_cq_full(cq)) { 1268 break; 1269 } 1270 1271 sq = req->sq; 1272 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase); 1273 req->cqe.sq_id = cpu_to_le16(sq->sqid); 1274 req->cqe.sq_head = cpu_to_le16(sq->head); 1275 addr = cq->dma_addr + cq->tail * n->cqe_size; 1276 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe, 1277 sizeof(req->cqe)); 1278 if (ret) { 1279 trace_pci_nvme_err_addr_write(addr); 1280 trace_pci_nvme_err_cfs(); 1281 n->bar.csts = NVME_CSTS_FAILED; 1282 break; 1283 } 1284 QTAILQ_REMOVE(&cq->req_list, req, entry); 1285 nvme_inc_cq_tail(cq); 1286 nvme_sg_unmap(&req->sg); 1287 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); 1288 } 1289 if (cq->tail != cq->head) { 1290 nvme_irq_assert(n, cq); 1291 } 1292 } 1293 1294 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) 1295 { 1296 assert(cq->cqid == req->sq->cqid); 1297 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid, 1298 req->status); 1299 1300 if (req->status) { 1301 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns), 1302 req->status, req->cmd.opcode); 1303 } 1304 1305 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); 1306 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); 1307 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 1308 } 1309 1310 static void nvme_process_aers(void *opaque) 1311 { 1312 NvmeCtrl *n = opaque; 1313 NvmeAsyncEvent *event, *next; 1314 1315 trace_pci_nvme_process_aers(n->aer_queued); 1316 1317 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) { 1318 NvmeRequest *req; 1319 NvmeAerResult *result; 1320 1321 /* can't post 
cqe if there is nothing to complete */ 1322 if (!n->outstanding_aers) { 1323 trace_pci_nvme_no_outstanding_aers(); 1324 break; 1325 } 1326 1327 /* ignore if masked (cqe posted, but event not cleared) */ 1328 if (n->aer_mask & (1 << event->result.event_type)) { 1329 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask); 1330 continue; 1331 } 1332 1333 QTAILQ_REMOVE(&n->aer_queue, event, entry); 1334 n->aer_queued--; 1335 1336 n->aer_mask |= 1 << event->result.event_type; 1337 n->outstanding_aers--; 1338 1339 req = n->aer_reqs[n->outstanding_aers]; 1340 1341 result = (NvmeAerResult *) &req->cqe.result; 1342 result->event_type = event->result.event_type; 1343 result->event_info = event->result.event_info; 1344 result->log_page = event->result.log_page; 1345 g_free(event); 1346 1347 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info, 1348 result->log_page); 1349 1350 nvme_enqueue_req_completion(&n->admin_cq, req); 1351 } 1352 } 1353 1354 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type, 1355 uint8_t event_info, uint8_t log_page) 1356 { 1357 NvmeAsyncEvent *event; 1358 1359 trace_pci_nvme_enqueue_event(event_type, event_info, log_page); 1360 1361 if (n->aer_queued == n->params.aer_max_queued) { 1362 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued); 1363 return; 1364 } 1365 1366 event = g_new(NvmeAsyncEvent, 1); 1367 event->result = (NvmeAerResult) { 1368 .event_type = event_type, 1369 .event_info = event_info, 1370 .log_page = log_page, 1371 }; 1372 1373 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry); 1374 n->aer_queued++; 1375 1376 nvme_process_aers(n); 1377 } 1378 1379 static void nvme_smart_event(NvmeCtrl *n, uint8_t event) 1380 { 1381 uint8_t aer_info; 1382 1383 /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */ 1384 if (!(NVME_AEC_SMART(n->features.async_config) & event)) { 1385 return; 1386 } 1387 1388 switch (event) { 1389 case NVME_SMART_SPARE: 1390 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH; 1391 break; 1392 case NVME_SMART_TEMPERATURE: 1393 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH; 1394 break; 1395 case NVME_SMART_RELIABILITY: 1396 case NVME_SMART_MEDIA_READ_ONLY: 1397 case NVME_SMART_FAILED_VOLATILE_MEDIA: 1398 case NVME_SMART_PMR_UNRELIABLE: 1399 aer_info = NVME_AER_INFO_SMART_RELIABILITY; 1400 break; 1401 default: 1402 return; 1403 } 1404 1405 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO); 1406 } 1407 1408 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) 1409 { 1410 n->aer_mask &= ~(1 << event_type); 1411 if (!QTAILQ_EMPTY(&n->aer_queue)) { 1412 nvme_process_aers(n); 1413 } 1414 } 1415 1416 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) 1417 { 1418 uint8_t mdts = n->params.mdts; 1419 1420 if (mdts && len > n->page_size << mdts) { 1421 trace_pci_nvme_err_mdts(len); 1422 return NVME_INVALID_FIELD | NVME_DNR; 1423 } 1424 1425 return NVME_SUCCESS; 1426 } 1427 1428 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, 1429 uint32_t nlb) 1430 { 1431 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze); 1432 1433 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) { 1434 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze); 1435 return NVME_LBA_RANGE | NVME_DNR; 1436 } 1437 1438 return NVME_SUCCESS; 1439 } 1440 1441 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, 1442 uint32_t nlb) 1443 { 1444 BlockDriverState *bs = blk_bs(ns->blkconf.blk); 1445 1446 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb); 1447 int64_t offset = 
nvme_l2b(ns, slba); 1448 bool zeroed; 1449 int ret; 1450 1451 Error *local_err = NULL; 1452 1453 /* 1454 * `pnum` holds the number of bytes after offset that shares the same 1455 * allocation status as the byte at offset. If `pnum` is different from 1456 * `bytes`, we should check the allocation status of the next range and 1457 * continue this until all bytes have been checked. 1458 */ 1459 do { 1460 bytes -= pnum; 1461 1462 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL); 1463 if (ret < 0) { 1464 error_setg_errno(&local_err, -ret, "unable to get block status"); 1465 error_report_err(local_err); 1466 1467 return NVME_INTERNAL_DEV_ERROR; 1468 } 1469 1470 zeroed = !!(ret & BDRV_BLOCK_ZERO); 1471 1472 trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed); 1473 1474 if (zeroed) { 1475 return NVME_DULB; 1476 } 1477 1478 offset += pnum; 1479 } while (pnum != bytes); 1480 1481 return NVME_SUCCESS; 1482 } 1483 1484 static void nvme_aio_err(NvmeRequest *req, int ret) 1485 { 1486 uint16_t status = NVME_SUCCESS; 1487 Error *local_err = NULL; 1488 1489 switch (req->cmd.opcode) { 1490 case NVME_CMD_READ: 1491 status = NVME_UNRECOVERED_READ; 1492 break; 1493 case NVME_CMD_FLUSH: 1494 case NVME_CMD_WRITE: 1495 case NVME_CMD_WRITE_ZEROES: 1496 case NVME_CMD_ZONE_APPEND: 1497 status = NVME_WRITE_FAULT; 1498 break; 1499 default: 1500 status = NVME_INTERNAL_DEV_ERROR; 1501 break; 1502 } 1503 1504 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); 1505 1506 error_setg_errno(&local_err, -ret, "aio failed"); 1507 error_report_err(local_err); 1508 1509 /* 1510 * Set the command status code to the first encountered error but allow a 1511 * subsequent Internal Device Error to trump it. 1512 */ 1513 if (req->status && status != NVME_INTERNAL_DEV_ERROR) { 1514 return; 1515 } 1516 1517 req->status = status; 1518 } 1519 1520 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba) 1521 { 1522 return ns->zone_size_log2 > 0 ? 
slba >> ns->zone_size_log2 : 1523 slba / ns->zone_size; 1524 } 1525 1526 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) 1527 { 1528 uint32_t zone_idx = nvme_zone_idx(ns, slba); 1529 1530 assert(zone_idx < ns->num_zones); 1531 return &ns->zone_array[zone_idx]; 1532 } 1533 1534 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) 1535 { 1536 uint64_t zslba = zone->d.zslba; 1537 1538 switch (nvme_get_zone_state(zone)) { 1539 case NVME_ZONE_STATE_EMPTY: 1540 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1541 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1542 case NVME_ZONE_STATE_CLOSED: 1543 return NVME_SUCCESS; 1544 case NVME_ZONE_STATE_FULL: 1545 trace_pci_nvme_err_zone_is_full(zslba); 1546 return NVME_ZONE_FULL; 1547 case NVME_ZONE_STATE_OFFLINE: 1548 trace_pci_nvme_err_zone_is_offline(zslba); 1549 return NVME_ZONE_OFFLINE; 1550 case NVME_ZONE_STATE_READ_ONLY: 1551 trace_pci_nvme_err_zone_is_read_only(zslba); 1552 return NVME_ZONE_READ_ONLY; 1553 default: 1554 assert(false); 1555 } 1556 1557 return NVME_INTERNAL_DEV_ERROR; 1558 } 1559 1560 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, 1561 uint64_t slba, uint32_t nlb) 1562 { 1563 uint64_t zcap = nvme_zone_wr_boundary(zone); 1564 uint16_t status; 1565 1566 status = nvme_check_zone_state_for_write(zone); 1567 if (status) { 1568 return status; 1569 } 1570 1571 if (unlikely(slba != zone->w_ptr)) { 1572 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); 1573 return NVME_ZONE_INVALID_WRITE; 1574 } 1575 1576 if (unlikely((slba + nlb) > zcap)) { 1577 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap); 1578 return NVME_ZONE_BOUNDARY_ERROR; 1579 } 1580 1581 return NVME_SUCCESS; 1582 } 1583 1584 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) 1585 { 1586 switch (nvme_get_zone_state(zone)) { 1587 case NVME_ZONE_STATE_EMPTY: 1588 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1589 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1590 case NVME_ZONE_STATE_FULL: 1591 case NVME_ZONE_STATE_CLOSED: 1592 case NVME_ZONE_STATE_READ_ONLY: 1593 return NVME_SUCCESS; 1594 case NVME_ZONE_STATE_OFFLINE: 1595 trace_pci_nvme_err_zone_is_offline(zone->d.zslba); 1596 return NVME_ZONE_OFFLINE; 1597 default: 1598 assert(false); 1599 } 1600 1601 return NVME_INTERNAL_DEV_ERROR; 1602 } 1603 1604 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, 1605 uint32_t nlb) 1606 { 1607 NvmeZone *zone = nvme_get_zone_by_slba(ns, slba); 1608 uint64_t bndry = nvme_zone_rd_boundary(ns, zone); 1609 uint64_t end = slba + nlb; 1610 uint16_t status; 1611 1612 status = nvme_check_zone_state_for_read(zone); 1613 if (status) { 1614 ; 1615 } else if (unlikely(end > bndry)) { 1616 if (!ns->params.cross_zone_read) { 1617 status = NVME_ZONE_BOUNDARY_ERROR; 1618 } else { 1619 /* 1620 * Read across zone boundary - check that all subsequent 1621 * zones that are being read have an appropriate state. 
1622 */ 1623 do { 1624 zone++; 1625 status = nvme_check_zone_state_for_read(zone); 1626 if (status) { 1627 break; 1628 } 1629 } while (end > nvme_zone_rd_boundary(ns, zone)); 1630 } 1631 } 1632 1633 return status; 1634 } 1635 1636 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) 1637 { 1638 switch (nvme_get_zone_state(zone)) { 1639 case NVME_ZONE_STATE_FULL: 1640 return NVME_SUCCESS; 1641 1642 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1643 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1644 nvme_aor_dec_open(ns); 1645 /* fallthrough */ 1646 case NVME_ZONE_STATE_CLOSED: 1647 nvme_aor_dec_active(ns); 1648 /* fallthrough */ 1649 case NVME_ZONE_STATE_EMPTY: 1650 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); 1651 return NVME_SUCCESS; 1652 1653 default: 1654 return NVME_ZONE_INVAL_TRANSITION; 1655 } 1656 } 1657 1658 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) 1659 { 1660 switch (nvme_get_zone_state(zone)) { 1661 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1662 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1663 nvme_aor_dec_open(ns); 1664 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 1665 /* fall through */ 1666 case NVME_ZONE_STATE_CLOSED: 1667 return NVME_SUCCESS; 1668 1669 default: 1670 return NVME_ZONE_INVAL_TRANSITION; 1671 } 1672 } 1673 1674 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) 1675 { 1676 NvmeZone *zone; 1677 1678 if (ns->params.max_open_zones && 1679 ns->nr_open_zones == ns->params.max_open_zones) { 1680 zone = QTAILQ_FIRST(&ns->imp_open_zones); 1681 if (zone) { 1682 /* 1683 * Automatically close this implicitly open zone. 1684 */ 1685 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 1686 nvme_zrm_close(ns, zone); 1687 } 1688 } 1689 } 1690 1691 enum { 1692 NVME_ZRM_AUTO = 1 << 0, 1693 }; 1694 1695 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns, 1696 NvmeZone *zone, int flags) 1697 { 1698 int act = 0; 1699 uint16_t status; 1700 1701 switch (nvme_get_zone_state(zone)) { 1702 case NVME_ZONE_STATE_EMPTY: 1703 act = 1; 1704 1705 /* fallthrough */ 1706 1707 case NVME_ZONE_STATE_CLOSED: 1708 if (n->params.auto_transition_zones) { 1709 nvme_zrm_auto_transition_zone(ns); 1710 } 1711 status = nvme_aor_check(ns, act, 1); 1712 if (status) { 1713 return status; 1714 } 1715 1716 if (act) { 1717 nvme_aor_inc_active(ns); 1718 } 1719 1720 nvme_aor_inc_open(ns); 1721 1722 if (flags & NVME_ZRM_AUTO) { 1723 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); 1724 return NVME_SUCCESS; 1725 } 1726 1727 /* fallthrough */ 1728 1729 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1730 if (flags & NVME_ZRM_AUTO) { 1731 return NVME_SUCCESS; 1732 } 1733 1734 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); 1735 1736 /* fallthrough */ 1737 1738 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1739 return NVME_SUCCESS; 1740 1741 default: 1742 return NVME_ZONE_INVAL_TRANSITION; 1743 } 1744 } 1745 1746 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns, 1747 NvmeZone *zone) 1748 { 1749 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO); 1750 } 1751 1752 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns, 1753 NvmeZone *zone) 1754 { 1755 return nvme_zrm_open_flags(n, ns, zone, 0); 1756 } 1757 1758 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, 1759 uint32_t nlb) 1760 { 1761 zone->d.wp += nlb; 1762 1763 if (zone->d.wp == nvme_zone_wr_boundary(zone)) { 1764 nvme_zrm_finish(ns, zone); 1765 } 1766 } 1767 1768 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest 
*req) 1769 { 1770 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1771 NvmeZone *zone; 1772 uint64_t slba; 1773 uint32_t nlb; 1774 1775 slba = le64_to_cpu(rw->slba); 1776 nlb = le16_to_cpu(rw->nlb) + 1; 1777 zone = nvme_get_zone_by_slba(ns, slba); 1778 1779 nvme_advance_zone_wp(ns, zone, nlb); 1780 } 1781 1782 static inline bool nvme_is_write(NvmeRequest *req) 1783 { 1784 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1785 1786 return rw->opcode == NVME_CMD_WRITE || 1787 rw->opcode == NVME_CMD_ZONE_APPEND || 1788 rw->opcode == NVME_CMD_WRITE_ZEROES; 1789 } 1790 1791 static void nvme_misc_cb(void *opaque, int ret) 1792 { 1793 NvmeRequest *req = opaque; 1794 NvmeNamespace *ns = req->ns; 1795 1796 BlockBackend *blk = ns->blkconf.blk; 1797 BlockAcctCookie *acct = &req->acct; 1798 BlockAcctStats *stats = blk_get_stats(blk); 1799 1800 trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk)); 1801 1802 if (ret) { 1803 block_acct_failed(stats, acct); 1804 nvme_aio_err(req, ret); 1805 } else { 1806 block_acct_done(stats, acct); 1807 } 1808 1809 nvme_enqueue_req_completion(nvme_cq(req), req); 1810 } 1811 1812 void nvme_rw_complete_cb(void *opaque, int ret) 1813 { 1814 NvmeRequest *req = opaque; 1815 NvmeNamespace *ns = req->ns; 1816 BlockBackend *blk = ns->blkconf.blk; 1817 BlockAcctCookie *acct = &req->acct; 1818 BlockAcctStats *stats = blk_get_stats(blk); 1819 1820 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk)); 1821 1822 if (ret) { 1823 block_acct_failed(stats, acct); 1824 nvme_aio_err(req, ret); 1825 } else { 1826 block_acct_done(stats, acct); 1827 } 1828 1829 if (ns->params.zoned && nvme_is_write(req)) { 1830 nvme_finalize_zoned_write(ns, req); 1831 } 1832 1833 nvme_enqueue_req_completion(nvme_cq(req), req); 1834 } 1835 1836 static void nvme_rw_cb(void *opaque, int ret) 1837 { 1838 NvmeRequest *req = opaque; 1839 NvmeNamespace *ns = req->ns; 1840 1841 BlockBackend *blk = ns->blkconf.blk; 1842 1843 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); 1844 1845 if (ret) { 1846 goto out; 1847 } 1848 1849 if (ns->lbaf.ms) { 1850 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1851 uint64_t slba = le64_to_cpu(rw->slba); 1852 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 1853 uint64_t offset = nvme_moff(ns, slba); 1854 1855 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) { 1856 size_t mlen = nvme_m2b(ns, nlb); 1857 1858 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen, 1859 BDRV_REQ_MAY_UNMAP, 1860 nvme_rw_complete_cb, req); 1861 return; 1862 } 1863 1864 if (nvme_ns_ext(ns) || req->cmd.mptr) { 1865 uint16_t status; 1866 1867 nvme_sg_unmap(&req->sg); 1868 status = nvme_map_mdata(nvme_ctrl(req), nlb, req); 1869 if (status) { 1870 ret = -EFAULT; 1871 goto out; 1872 } 1873 1874 if (req->cmd.opcode == NVME_CMD_READ) { 1875 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req); 1876 } 1877 1878 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req); 1879 } 1880 } 1881 1882 out: 1883 nvme_rw_complete_cb(req, ret); 1884 } 1885 1886 struct nvme_aio_format_ctx { 1887 NvmeRequest *req; 1888 NvmeNamespace *ns; 1889 1890 /* number of outstanding write zeroes for this namespace */ 1891 int *count; 1892 }; 1893 1894 static void nvme_aio_format_cb(void *opaque, int ret) 1895 { 1896 struct nvme_aio_format_ctx *ctx = opaque; 1897 NvmeRequest *req = ctx->req; 1898 NvmeNamespace *ns = ctx->ns; 1899 uintptr_t *num_formats = (uintptr_t *)&req->opaque; 1900 int *count = ctx->count; 1901 1902 g_free(ctx); 1903 1904 if (ret) { 1905 nvme_aio_err(req, ret); 1906 } 1907 1908 if (--(*count)) { 1909 return; 1910 } 1911 1912 
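    /*
     * This was the last outstanding write zeroes aio for this namespace:
     * release the per-namespace counter and clear the namespace status. The
     * request itself only completes once every namespace selected by the
     * format has finished (tracked in num_formats).
     */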
g_free(count); 1913 ns->status = 0x0; 1914 1915 if (--(*num_formats)) { 1916 return; 1917 } 1918 1919 nvme_enqueue_req_completion(nvme_cq(req), req); 1920 } 1921 1922 struct nvme_aio_flush_ctx { 1923 NvmeRequest *req; 1924 NvmeNamespace *ns; 1925 BlockAcctCookie acct; 1926 }; 1927 1928 static void nvme_aio_flush_cb(void *opaque, int ret) 1929 { 1930 struct nvme_aio_flush_ctx *ctx = opaque; 1931 NvmeRequest *req = ctx->req; 1932 uintptr_t *num_flushes = (uintptr_t *)&req->opaque; 1933 1934 BlockBackend *blk = ctx->ns->blkconf.blk; 1935 BlockAcctCookie *acct = &ctx->acct; 1936 BlockAcctStats *stats = blk_get_stats(blk); 1937 1938 trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk)); 1939 1940 if (!ret) { 1941 block_acct_done(stats, acct); 1942 } else { 1943 block_acct_failed(stats, acct); 1944 nvme_aio_err(req, ret); 1945 } 1946 1947 (*num_flushes)--; 1948 g_free(ctx); 1949 1950 if (*num_flushes) { 1951 return; 1952 } 1953 1954 nvme_enqueue_req_completion(nvme_cq(req), req); 1955 } 1956 1957 static void nvme_verify_cb(void *opaque, int ret) 1958 { 1959 NvmeBounceContext *ctx = opaque; 1960 NvmeRequest *req = ctx->req; 1961 NvmeNamespace *ns = req->ns; 1962 BlockBackend *blk = ns->blkconf.blk; 1963 BlockAcctCookie *acct = &req->acct; 1964 BlockAcctStats *stats = blk_get_stats(blk); 1965 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1966 uint64_t slba = le64_to_cpu(rw->slba); 1967 uint16_t ctrl = le16_to_cpu(rw->control); 1968 uint16_t apptag = le16_to_cpu(rw->apptag); 1969 uint16_t appmask = le16_to_cpu(rw->appmask); 1970 uint32_t reftag = le32_to_cpu(rw->reftag); 1971 uint16_t status; 1972 1973 trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag, 1974 appmask, reftag); 1975 1976 if (ret) { 1977 block_acct_failed(stats, acct); 1978 nvme_aio_err(req, ret); 1979 goto out; 1980 } 1981 1982 block_acct_done(stats, acct); 1983 1984 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 1985 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce, 1986 ctx->mdata.iov.size, slba); 1987 if (status) { 1988 req->status = status; 1989 goto out; 1990 } 1991 1992 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 1993 ctx->mdata.bounce, ctx->mdata.iov.size, 1994 ctrl, slba, apptag, appmask, reftag); 1995 } 1996 1997 out: 1998 qemu_iovec_destroy(&ctx->data.iov); 1999 g_free(ctx->data.bounce); 2000 2001 qemu_iovec_destroy(&ctx->mdata.iov); 2002 g_free(ctx->mdata.bounce); 2003 2004 g_free(ctx); 2005 2006 nvme_enqueue_req_completion(nvme_cq(req), req); 2007 } 2008 2009 2010 static void nvme_verify_mdata_in_cb(void *opaque, int ret) 2011 { 2012 NvmeBounceContext *ctx = opaque; 2013 NvmeRequest *req = ctx->req; 2014 NvmeNamespace *ns = req->ns; 2015 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2016 uint64_t slba = le64_to_cpu(rw->slba); 2017 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2018 size_t mlen = nvme_m2b(ns, nlb); 2019 uint64_t offset = nvme_moff(ns, slba); 2020 BlockBackend *blk = ns->blkconf.blk; 2021 2022 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk)); 2023 2024 if (ret) { 2025 goto out; 2026 } 2027 2028 ctx->mdata.bounce = g_malloc(mlen); 2029 2030 qemu_iovec_reset(&ctx->mdata.iov); 2031 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2032 2033 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2034 nvme_verify_cb, ctx); 2035 return; 2036 2037 out: 2038 nvme_verify_cb(ctx, ret); 2039 } 2040 2041 static void nvme_aio_discard_cb(void *opaque, int ret) 2042 { 2043 NvmeRequest *req = opaque; 2044 uintptr_t *discards = (uintptr_t *)&req->opaque; 2045 2046 
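    /*
     * Completion callback for an individual discard; req->opaque counts the
     * discards still outstanding and the request is only completed when the
     * count reaches zero.
     */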
trace_pci_nvme_aio_discard_cb(nvme_cid(req)); 2047 2048 if (ret) { 2049 nvme_aio_err(req, ret); 2050 } 2051 2052 (*discards)--; 2053 2054 if (*discards) { 2055 return; 2056 } 2057 2058 nvme_enqueue_req_completion(nvme_cq(req), req); 2059 } 2060 2061 struct nvme_zone_reset_ctx { 2062 NvmeRequest *req; 2063 NvmeZone *zone; 2064 }; 2065 2066 static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret) 2067 { 2068 struct nvme_zone_reset_ctx *ctx = opaque; 2069 NvmeRequest *req = ctx->req; 2070 NvmeNamespace *ns = req->ns; 2071 NvmeZone *zone = ctx->zone; 2072 uintptr_t *resets = (uintptr_t *)&req->opaque; 2073 2074 if (ret) { 2075 nvme_aio_err(req, ret); 2076 goto out; 2077 } 2078 2079 switch (nvme_get_zone_state(zone)) { 2080 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 2081 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 2082 nvme_aor_dec_open(ns); 2083 /* fall through */ 2084 case NVME_ZONE_STATE_CLOSED: 2085 nvme_aor_dec_active(ns); 2086 /* fall through */ 2087 case NVME_ZONE_STATE_FULL: 2088 zone->w_ptr = zone->d.zslba; 2089 zone->d.wp = zone->w_ptr; 2090 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); 2091 /* fall through */ 2092 default: 2093 break; 2094 } 2095 2096 out: 2097 g_free(ctx); 2098 2099 (*resets)--; 2100 2101 if (*resets) { 2102 return; 2103 } 2104 2105 nvme_enqueue_req_completion(nvme_cq(req), req); 2106 } 2107 2108 static void nvme_aio_zone_reset_cb(void *opaque, int ret) 2109 { 2110 struct nvme_zone_reset_ctx *ctx = opaque; 2111 NvmeRequest *req = ctx->req; 2112 NvmeNamespace *ns = req->ns; 2113 NvmeZone *zone = ctx->zone; 2114 2115 trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba); 2116 2117 if (ret) { 2118 goto out; 2119 } 2120 2121 if (ns->lbaf.ms) { 2122 int64_t offset = nvme_moff(ns, zone->d.zslba); 2123 2124 blk_aio_pwrite_zeroes(ns->blkconf.blk, offset, 2125 nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, 2126 nvme_aio_zone_reset_complete_cb, ctx); 2127 return; 2128 } 2129 2130 out: 2131 nvme_aio_zone_reset_complete_cb(opaque, ret); 2132 } 2133 2134 struct nvme_copy_ctx { 2135 int copies; 2136 uint8_t *bounce; 2137 uint8_t *mbounce; 2138 uint32_t nlb; 2139 NvmeCopySourceRange *ranges; 2140 }; 2141 2142 struct nvme_copy_in_ctx { 2143 NvmeRequest *req; 2144 QEMUIOVector iov; 2145 NvmeCopySourceRange *range; 2146 }; 2147 2148 static void nvme_copy_complete_cb(void *opaque, int ret) 2149 { 2150 NvmeRequest *req = opaque; 2151 NvmeNamespace *ns = req->ns; 2152 struct nvme_copy_ctx *ctx = req->opaque; 2153 2154 if (ret) { 2155 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); 2156 nvme_aio_err(req, ret); 2157 goto out; 2158 } 2159 2160 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); 2161 2162 out: 2163 if (ns->params.zoned) { 2164 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2165 uint64_t sdlba = le64_to_cpu(copy->sdlba); 2166 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); 2167 2168 nvme_advance_zone_wp(ns, zone, ctx->nlb); 2169 } 2170 2171 g_free(ctx->bounce); 2172 g_free(ctx->mbounce); 2173 g_free(ctx); 2174 2175 nvme_enqueue_req_completion(nvme_cq(req), req); 2176 } 2177 2178 static void nvme_copy_cb(void *opaque, int ret) 2179 { 2180 NvmeRequest *req = opaque; 2181 NvmeNamespace *ns = req->ns; 2182 struct nvme_copy_ctx *ctx = req->opaque; 2183 2184 trace_pci_nvme_copy_cb(nvme_cid(req)); 2185 2186 if (ret) { 2187 goto out; 2188 } 2189 2190 if (ns->lbaf.ms) { 2191 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2192 uint64_t sdlba = le64_to_cpu(copy->sdlba); 2193 int64_t offset = nvme_moff(ns, sdlba); 2194 2195 
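        /*
         * The data portion of the copy has been written; chain a second
         * write that stores the collected metadata bounce buffer at the
         * destination metadata offset before completing the request.
         */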
qemu_iovec_reset(&req->sg.iov); 2196 qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb)); 2197 2198 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0, 2199 nvme_copy_complete_cb, req); 2200 return; 2201 } 2202 2203 out: 2204 nvme_copy_complete_cb(opaque, ret); 2205 } 2206 2207 static void nvme_copy_in_complete(NvmeRequest *req) 2208 { 2209 NvmeNamespace *ns = req->ns; 2210 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2211 struct nvme_copy_ctx *ctx = req->opaque; 2212 uint64_t sdlba = le64_to_cpu(copy->sdlba); 2213 uint16_t status; 2214 2215 trace_pci_nvme_copy_in_complete(nvme_cid(req)); 2216 2217 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); 2218 2219 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2220 uint16_t prinfor = (copy->control[0] >> 4) & 0xf; 2221 uint16_t prinfow = (copy->control[2] >> 2) & 0xf; 2222 uint16_t nr = copy->nr + 1; 2223 NvmeCopySourceRange *range; 2224 uint64_t slba; 2225 uint32_t nlb; 2226 uint16_t apptag, appmask; 2227 uint32_t reftag; 2228 uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce; 2229 size_t len, mlen; 2230 int i; 2231 2232 /* 2233 * The dif helpers expects prinfo to be similar to the control field of 2234 * the NvmeRwCmd, so shift by 10 to fake it. 2235 */ 2236 prinfor = prinfor << 10; 2237 prinfow = prinfow << 10; 2238 2239 for (i = 0; i < nr; i++) { 2240 range = &ctx->ranges[i]; 2241 slba = le64_to_cpu(range->slba); 2242 nlb = le16_to_cpu(range->nlb) + 1; 2243 len = nvme_l2b(ns, nlb); 2244 mlen = nvme_m2b(ns, nlb); 2245 apptag = le16_to_cpu(range->apptag); 2246 appmask = le16_to_cpu(range->appmask); 2247 reftag = le32_to_cpu(range->reftag); 2248 2249 status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba, 2250 apptag, appmask, reftag); 2251 if (status) { 2252 goto invalid; 2253 } 2254 2255 buf += len; 2256 mbuf += mlen; 2257 } 2258 2259 apptag = le16_to_cpu(copy->apptag); 2260 appmask = le16_to_cpu(copy->appmask); 2261 reftag = le32_to_cpu(copy->reftag); 2262 2263 if (prinfow & NVME_RW_PRINFO_PRACT) { 2264 size_t len = nvme_l2b(ns, ctx->nlb); 2265 size_t mlen = nvme_m2b(ns, ctx->nlb); 2266 2267 status = nvme_check_prinfo(ns, prinfow, sdlba, reftag); 2268 if (status) { 2269 goto invalid; 2270 } 2271 2272 nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce, 2273 mlen, apptag, reftag); 2274 } else { 2275 status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen, 2276 prinfow, sdlba, apptag, appmask, reftag); 2277 if (status) { 2278 goto invalid; 2279 } 2280 } 2281 } 2282 2283 status = nvme_check_bounds(ns, sdlba, ctx->nlb); 2284 if (status) { 2285 goto invalid; 2286 } 2287 2288 if (ns->params.zoned) { 2289 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); 2290 2291 status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb); 2292 if (status) { 2293 goto invalid; 2294 } 2295 2296 status = nvme_zrm_auto(nvme_ctrl(req), ns, zone); 2297 if (status) { 2298 goto invalid; 2299 } 2300 2301 zone->w_ptr += ctx->nlb; 2302 } 2303 2304 qemu_iovec_init(&req->sg.iov, 1); 2305 qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); 2306 2307 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, 2308 BLOCK_ACCT_WRITE); 2309 2310 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba), 2311 &req->sg.iov, 0, nvme_copy_cb, req); 2312 2313 return; 2314 2315 invalid: 2316 req->status = status; 2317 2318 g_free(ctx->bounce); 2319 g_free(ctx); 2320 2321 nvme_enqueue_req_completion(nvme_cq(req), req); 2322 } 2323 2324 static void nvme_aio_copy_in_cb(void *opaque, int ret) 2325 
{ 2326 struct nvme_copy_in_ctx *in_ctx = opaque; 2327 NvmeRequest *req = in_ctx->req; 2328 NvmeNamespace *ns = req->ns; 2329 struct nvme_copy_ctx *ctx = req->opaque; 2330 2331 qemu_iovec_destroy(&in_ctx->iov); 2332 g_free(in_ctx); 2333 2334 trace_pci_nvme_aio_copy_in_cb(nvme_cid(req)); 2335 2336 if (ret) { 2337 nvme_aio_err(req, ret); 2338 } 2339 2340 ctx->copies--; 2341 2342 if (ctx->copies) { 2343 return; 2344 } 2345 2346 if (req->status) { 2347 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); 2348 2349 g_free(ctx->bounce); 2350 g_free(ctx->mbounce); 2351 g_free(ctx); 2352 2353 nvme_enqueue_req_completion(nvme_cq(req), req); 2354 2355 return; 2356 } 2357 2358 nvme_copy_in_complete(req); 2359 } 2360 2361 struct nvme_compare_ctx { 2362 struct { 2363 QEMUIOVector iov; 2364 uint8_t *bounce; 2365 } data; 2366 2367 struct { 2368 QEMUIOVector iov; 2369 uint8_t *bounce; 2370 } mdata; 2371 }; 2372 2373 static void nvme_compare_mdata_cb(void *opaque, int ret) 2374 { 2375 NvmeRequest *req = opaque; 2376 NvmeNamespace *ns = req->ns; 2377 NvmeCtrl *n = nvme_ctrl(req); 2378 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2379 uint16_t ctrl = le16_to_cpu(rw->control); 2380 uint16_t apptag = le16_to_cpu(rw->apptag); 2381 uint16_t appmask = le16_to_cpu(rw->appmask); 2382 uint32_t reftag = le32_to_cpu(rw->reftag); 2383 struct nvme_compare_ctx *ctx = req->opaque; 2384 g_autofree uint8_t *buf = NULL; 2385 BlockBackend *blk = ns->blkconf.blk; 2386 BlockAcctCookie *acct = &req->acct; 2387 BlockAcctStats *stats = blk_get_stats(blk); 2388 uint16_t status = NVME_SUCCESS; 2389 2390 trace_pci_nvme_compare_mdata_cb(nvme_cid(req)); 2391 2392 if (ret) { 2393 block_acct_failed(stats, acct); 2394 nvme_aio_err(req, ret); 2395 goto out; 2396 } 2397 2398 buf = g_malloc(ctx->mdata.iov.size); 2399 2400 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size, 2401 NVME_TX_DIRECTION_TO_DEVICE, req); 2402 if (status) { 2403 req->status = status; 2404 goto out; 2405 } 2406 2407 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2408 uint64_t slba = le64_to_cpu(rw->slba); 2409 uint8_t *bufp; 2410 uint8_t *mbufp = ctx->mdata.bounce; 2411 uint8_t *end = mbufp + ctx->mdata.iov.size; 2412 int16_t pil = 0; 2413 2414 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 2415 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl, 2416 slba, apptag, appmask, reftag); 2417 if (status) { 2418 req->status = status; 2419 goto out; 2420 } 2421 2422 /* 2423 * When formatted with protection information, do not compare the DIF 2424 * tuple. 
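         *
         * The protection information fields themselves have already been
         * checked by nvme_dif_check() above according to the PRCHK settings
         * carried in the command.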
2425 */ 2426 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) { 2427 pil = ns->lbaf.ms - sizeof(NvmeDifTuple); 2428 } 2429 2430 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) { 2431 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) { 2432 req->status = NVME_CMP_FAILURE; 2433 goto out; 2434 } 2435 } 2436 2437 goto out; 2438 } 2439 2440 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) { 2441 req->status = NVME_CMP_FAILURE; 2442 goto out; 2443 } 2444 2445 block_acct_done(stats, acct); 2446 2447 out: 2448 qemu_iovec_destroy(&ctx->data.iov); 2449 g_free(ctx->data.bounce); 2450 2451 qemu_iovec_destroy(&ctx->mdata.iov); 2452 g_free(ctx->mdata.bounce); 2453 2454 g_free(ctx); 2455 2456 nvme_enqueue_req_completion(nvme_cq(req), req); 2457 } 2458 2459 static void nvme_compare_data_cb(void *opaque, int ret) 2460 { 2461 NvmeRequest *req = opaque; 2462 NvmeCtrl *n = nvme_ctrl(req); 2463 NvmeNamespace *ns = req->ns; 2464 BlockBackend *blk = ns->blkconf.blk; 2465 BlockAcctCookie *acct = &req->acct; 2466 BlockAcctStats *stats = blk_get_stats(blk); 2467 2468 struct nvme_compare_ctx *ctx = req->opaque; 2469 g_autofree uint8_t *buf = NULL; 2470 uint16_t status; 2471 2472 trace_pci_nvme_compare_data_cb(nvme_cid(req)); 2473 2474 if (ret) { 2475 block_acct_failed(stats, acct); 2476 nvme_aio_err(req, ret); 2477 goto out; 2478 } 2479 2480 buf = g_malloc(ctx->data.iov.size); 2481 2482 status = nvme_bounce_data(n, buf, ctx->data.iov.size, 2483 NVME_TX_DIRECTION_TO_DEVICE, req); 2484 if (status) { 2485 req->status = status; 2486 goto out; 2487 } 2488 2489 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) { 2490 req->status = NVME_CMP_FAILURE; 2491 goto out; 2492 } 2493 2494 if (ns->lbaf.ms) { 2495 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2496 uint64_t slba = le64_to_cpu(rw->slba); 2497 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2498 size_t mlen = nvme_m2b(ns, nlb); 2499 uint64_t offset = nvme_moff(ns, slba); 2500 2501 ctx->mdata.bounce = g_malloc(mlen); 2502 2503 qemu_iovec_init(&ctx->mdata.iov, 1); 2504 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2505 2506 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2507 nvme_compare_mdata_cb, req); 2508 return; 2509 } 2510 2511 block_acct_done(stats, acct); 2512 2513 out: 2514 qemu_iovec_destroy(&ctx->data.iov); 2515 g_free(ctx->data.bounce); 2516 g_free(ctx); 2517 2518 nvme_enqueue_req_completion(nvme_cq(req), req); 2519 } 2520 2521 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) 2522 { 2523 NvmeNamespace *ns = req->ns; 2524 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; 2525 2526 uint32_t attr = le32_to_cpu(dsm->attributes); 2527 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; 2528 2529 uint16_t status = NVME_SUCCESS; 2530 2531 trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); 2532 2533 if (attr & NVME_DSMGMT_AD) { 2534 int64_t offset; 2535 size_t len; 2536 NvmeDsmRange range[nr]; 2537 uintptr_t *discards = (uintptr_t *)&req->opaque; 2538 2539 status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req); 2540 if (status) { 2541 return status; 2542 } 2543 2544 /* 2545 * AIO callbacks may be called immediately, so initialize discards to 1 2546 * to make sure the the callback does not complete the request before 2547 * all discards have been issued. 
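         *
         * The extra reference is dropped again after the loop ("account for
         * the 1-initialization"); if no discards end up being issued, the
         * request completes synchronously with req->status.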
2548 */ 2549 *discards = 1; 2550 2551 for (int i = 0; i < nr; i++) { 2552 uint64_t slba = le64_to_cpu(range[i].slba); 2553 uint32_t nlb = le32_to_cpu(range[i].nlb); 2554 2555 if (nvme_check_bounds(ns, slba, nlb)) { 2556 continue; 2557 } 2558 2559 trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, 2560 nlb); 2561 2562 if (nlb > n->dmrsl) { 2563 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); 2564 } 2565 2566 offset = nvme_l2b(ns, slba); 2567 len = nvme_l2b(ns, nlb); 2568 2569 while (len) { 2570 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); 2571 2572 (*discards)++; 2573 2574 blk_aio_pdiscard(ns->blkconf.blk, offset, bytes, 2575 nvme_aio_discard_cb, req); 2576 2577 offset += bytes; 2578 len -= bytes; 2579 } 2580 } 2581 2582 /* account for the 1-initialization */ 2583 (*discards)--; 2584 2585 if (*discards) { 2586 status = NVME_NO_COMPLETE; 2587 } else { 2588 status = req->status; 2589 } 2590 } 2591 2592 return status; 2593 } 2594 2595 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) 2596 { 2597 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2598 NvmeNamespace *ns = req->ns; 2599 BlockBackend *blk = ns->blkconf.blk; 2600 uint64_t slba = le64_to_cpu(rw->slba); 2601 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2602 size_t len = nvme_l2b(ns, nlb); 2603 int64_t offset = nvme_l2b(ns, slba); 2604 uint16_t ctrl = le16_to_cpu(rw->control); 2605 uint32_t reftag = le32_to_cpu(rw->reftag); 2606 NvmeBounceContext *ctx = NULL; 2607 uint16_t status; 2608 2609 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2610 2611 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2612 status = nvme_check_prinfo(ns, ctrl, slba, reftag); 2613 if (status) { 2614 return status; 2615 } 2616 2617 if (ctrl & NVME_RW_PRINFO_PRACT) { 2618 return NVME_INVALID_PROT_INFO | NVME_DNR; 2619 } 2620 } 2621 2622 if (len > n->page_size << n->params.vsl) { 2623 return NVME_INVALID_FIELD | NVME_DNR; 2624 } 2625 2626 status = nvme_check_bounds(ns, slba, nlb); 2627 if (status) { 2628 return status; 2629 } 2630 2631 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2632 status = nvme_check_dulbe(ns, slba, nlb); 2633 if (status) { 2634 return status; 2635 } 2636 } 2637 2638 ctx = g_new0(NvmeBounceContext, 1); 2639 ctx->req = req; 2640 2641 ctx->data.bounce = g_malloc(len); 2642 2643 qemu_iovec_init(&ctx->data.iov, 1); 2644 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len); 2645 2646 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size, 2647 BLOCK_ACCT_READ); 2648 2649 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0, 2650 nvme_verify_mdata_in_cb, ctx); 2651 return NVME_NO_COMPLETE; 2652 } 2653 2654 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) 2655 { 2656 NvmeNamespace *ns = req->ns; 2657 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2658 2659 uint16_t nr = copy->nr + 1; 2660 uint8_t format = copy->control[0] & 0xf; 2661 2662 /* 2663 * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the 2664 * NVME_RW_PRINFO constants. 
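     *
     * In NvmeRwCmd the PRINFO bits occupy bits 13:10 of the 16-bit control
     * field, which is where the NVME_RW_PRINFO_* masks are defined.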
2665 */ 2666 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10; 2667 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10; 2668 2669 uint32_t nlb = 0; 2670 uint8_t *bounce = NULL, *bouncep = NULL; 2671 uint8_t *mbounce = NULL, *mbouncep = NULL; 2672 struct nvme_copy_ctx *ctx; 2673 uint16_t status; 2674 int i; 2675 2676 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); 2677 2678 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && 2679 ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) { 2680 return NVME_INVALID_FIELD | NVME_DNR; 2681 } 2682 2683 if (!(n->id_ctrl.ocfs & (1 << format))) { 2684 trace_pci_nvme_err_copy_invalid_format(format); 2685 return NVME_INVALID_FIELD | NVME_DNR; 2686 } 2687 2688 if (nr > ns->id_ns.msrc + 1) { 2689 return NVME_CMD_SIZE_LIMIT | NVME_DNR; 2690 } 2691 2692 ctx = g_new(struct nvme_copy_ctx, 1); 2693 ctx->ranges = g_new(NvmeCopySourceRange, nr); 2694 2695 status = nvme_h2c(n, (uint8_t *)ctx->ranges, 2696 nr * sizeof(NvmeCopySourceRange), req); 2697 if (status) { 2698 goto out; 2699 } 2700 2701 for (i = 0; i < nr; i++) { 2702 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba); 2703 uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1; 2704 2705 if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) { 2706 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 2707 goto out; 2708 } 2709 2710 status = nvme_check_bounds(ns, slba, _nlb); 2711 if (status) { 2712 goto out; 2713 } 2714 2715 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2716 status = nvme_check_dulbe(ns, slba, _nlb); 2717 if (status) { 2718 goto out; 2719 } 2720 } 2721 2722 if (ns->params.zoned) { 2723 status = nvme_check_zone_read(ns, slba, _nlb); 2724 if (status) { 2725 goto out; 2726 } 2727 } 2728 2729 nlb += _nlb; 2730 } 2731 2732 if (nlb > le32_to_cpu(ns->id_ns.mcl)) { 2733 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 2734 goto out; 2735 } 2736 2737 bounce = bouncep = g_malloc(nvme_l2b(ns, nlb)); 2738 if (ns->lbaf.ms) { 2739 mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb)); 2740 } 2741 2742 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, 2743 BLOCK_ACCT_READ); 2744 2745 ctx->bounce = bounce; 2746 ctx->mbounce = mbounce; 2747 ctx->nlb = nlb; 2748 ctx->copies = 1; 2749 2750 req->opaque = ctx; 2751 2752 for (i = 0; i < nr; i++) { 2753 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba); 2754 uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1; 2755 2756 size_t len = nvme_l2b(ns, nlb); 2757 int64_t offset = nvme_l2b(ns, slba); 2758 2759 trace_pci_nvme_copy_source_range(slba, nlb); 2760 2761 struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1); 2762 in_ctx->req = req; 2763 2764 qemu_iovec_init(&in_ctx->iov, 1); 2765 qemu_iovec_add(&in_ctx->iov, bouncep, len); 2766 2767 ctx->copies++; 2768 2769 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, 2770 nvme_aio_copy_in_cb, in_ctx); 2771 2772 bouncep += len; 2773 2774 if (ns->lbaf.ms) { 2775 len = nvme_m2b(ns, nlb); 2776 offset = nvme_moff(ns, slba); 2777 2778 in_ctx = g_new(struct nvme_copy_in_ctx, 1); 2779 in_ctx->req = req; 2780 2781 qemu_iovec_init(&in_ctx->iov, 1); 2782 qemu_iovec_add(&in_ctx->iov, mbouncep, len); 2783 2784 ctx->copies++; 2785 2786 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, 2787 nvme_aio_copy_in_cb, in_ctx); 2788 2789 mbouncep += len; 2790 } 2791 } 2792 2793 /* account for the 1-initialization */ 2794 ctx->copies--; 2795 2796 if (!ctx->copies) { 2797 nvme_copy_in_complete(req); 2798 } 2799 2800 return NVME_NO_COMPLETE; 2801 2802 out: 2803 g_free(ctx->ranges); 2804 g_free(ctx); 2805 2806 
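    /*
     * The bounce buffers are only allocated once all source ranges have been
     * validated, so this error path only needs to free the range list and
     * the context itself.
     */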
return status; 2807 } 2808 2809 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) 2810 { 2811 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2812 NvmeNamespace *ns = req->ns; 2813 BlockBackend *blk = ns->blkconf.blk; 2814 uint64_t slba = le64_to_cpu(rw->slba); 2815 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2816 uint16_t ctrl = le16_to_cpu(rw->control); 2817 size_t data_len = nvme_l2b(ns, nlb); 2818 size_t len = data_len; 2819 int64_t offset = nvme_l2b(ns, slba); 2820 struct nvme_compare_ctx *ctx = NULL; 2821 uint16_t status; 2822 2823 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2824 2825 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) { 2826 return NVME_INVALID_PROT_INFO | NVME_DNR; 2827 } 2828 2829 if (nvme_ns_ext(ns)) { 2830 len += nvme_m2b(ns, nlb); 2831 } 2832 2833 status = nvme_check_mdts(n, len); 2834 if (status) { 2835 return status; 2836 } 2837 2838 status = nvme_check_bounds(ns, slba, nlb); 2839 if (status) { 2840 return status; 2841 } 2842 2843 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2844 status = nvme_check_dulbe(ns, slba, nlb); 2845 if (status) { 2846 return status; 2847 } 2848 } 2849 2850 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 2851 if (status) { 2852 return status; 2853 } 2854 2855 ctx = g_new(struct nvme_compare_ctx, 1); 2856 ctx->data.bounce = g_malloc(data_len); 2857 2858 req->opaque = ctx; 2859 2860 qemu_iovec_init(&ctx->data.iov, 1); 2861 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len); 2862 2863 block_acct_start(blk_get_stats(blk), &req->acct, data_len, 2864 BLOCK_ACCT_READ); 2865 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0, 2866 nvme_compare_data_cb, req); 2867 2868 return NVME_NO_COMPLETE; 2869 } 2870 2871 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) 2872 { 2873 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 2874 uintptr_t *num_flushes = (uintptr_t *)&req->opaque; 2875 uint16_t status; 2876 struct nvme_aio_flush_ctx *ctx; 2877 NvmeNamespace *ns; 2878 2879 trace_pci_nvme_flush(nvme_cid(req), nsid); 2880 2881 if (nsid != NVME_NSID_BROADCAST) { 2882 req->ns = nvme_ns(n, nsid); 2883 if (unlikely(!req->ns)) { 2884 return NVME_INVALID_FIELD | NVME_DNR; 2885 } 2886 2887 block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, 2888 BLOCK_ACCT_FLUSH); 2889 req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req); 2890 return NVME_NO_COMPLETE; 2891 } 2892 2893 /* 1-initialize; see comment in nvme_dsm */ 2894 *num_flushes = 1; 2895 2896 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { 2897 ns = nvme_ns(n, i); 2898 if (!ns) { 2899 continue; 2900 } 2901 2902 ctx = g_new(struct nvme_aio_flush_ctx, 1); 2903 ctx->req = req; 2904 ctx->ns = ns; 2905 2906 (*num_flushes)++; 2907 2908 block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, 2909 BLOCK_ACCT_FLUSH); 2910 blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); 2911 } 2912 2913 /* account for the 1-initialization */ 2914 (*num_flushes)--; 2915 2916 if (*num_flushes) { 2917 status = NVME_NO_COMPLETE; 2918 } else { 2919 status = req->status; 2920 } 2921 2922 return status; 2923 } 2924 2925 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) 2926 { 2927 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2928 NvmeNamespace *ns = req->ns; 2929 uint64_t slba = le64_to_cpu(rw->slba); 2930 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 2931 uint16_t ctrl = le16_to_cpu(rw->control); 2932 uint64_t data_size = nvme_l2b(ns, nlb); 2933 uint64_t mapped_size = data_size; 2934 uint64_t data_offset; 2935 
BlockBackend *blk = ns->blkconf.blk; 2936 uint16_t status; 2937 2938 if (nvme_ns_ext(ns)) { 2939 mapped_size += nvme_m2b(ns, nlb); 2940 2941 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2942 bool pract = ctrl & NVME_RW_PRINFO_PRACT; 2943 2944 if (pract && ns->lbaf.ms == 8) { 2945 mapped_size = data_size; 2946 } 2947 } 2948 } 2949 2950 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba); 2951 2952 status = nvme_check_mdts(n, mapped_size); 2953 if (status) { 2954 goto invalid; 2955 } 2956 2957 status = nvme_check_bounds(ns, slba, nlb); 2958 if (status) { 2959 goto invalid; 2960 } 2961 2962 if (ns->params.zoned) { 2963 status = nvme_check_zone_read(ns, slba, nlb); 2964 if (status) { 2965 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); 2966 goto invalid; 2967 } 2968 } 2969 2970 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2971 status = nvme_check_dulbe(ns, slba, nlb); 2972 if (status) { 2973 goto invalid; 2974 } 2975 } 2976 2977 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2978 return nvme_dif_rw(n, req); 2979 } 2980 2981 status = nvme_map_data(n, nlb, req); 2982 if (status) { 2983 goto invalid; 2984 } 2985 2986 data_offset = nvme_l2b(ns, slba); 2987 2988 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 2989 BLOCK_ACCT_READ); 2990 nvme_blk_read(blk, data_offset, nvme_rw_cb, req); 2991 return NVME_NO_COMPLETE; 2992 2993 invalid: 2994 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); 2995 return status | NVME_DNR; 2996 } 2997 2998 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, 2999 bool wrz) 3000 { 3001 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3002 NvmeNamespace *ns = req->ns; 3003 uint64_t slba = le64_to_cpu(rw->slba); 3004 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 3005 uint16_t ctrl = le16_to_cpu(rw->control); 3006 uint64_t data_size = nvme_l2b(ns, nlb); 3007 uint64_t mapped_size = data_size; 3008 uint64_t data_offset; 3009 NvmeZone *zone; 3010 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; 3011 BlockBackend *blk = ns->blkconf.blk; 3012 uint16_t status; 3013 3014 if (nvme_ns_ext(ns)) { 3015 mapped_size += nvme_m2b(ns, nlb); 3016 3017 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3018 bool pract = ctrl & NVME_RW_PRINFO_PRACT; 3019 3020 if (pract && ns->lbaf.ms == 8) { 3021 mapped_size -= nvme_m2b(ns, nlb); 3022 } 3023 } 3024 } 3025 3026 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), 3027 nvme_nsid(ns), nlb, mapped_size, slba); 3028 3029 if (!wrz) { 3030 status = nvme_check_mdts(n, mapped_size); 3031 if (status) { 3032 goto invalid; 3033 } 3034 } 3035 3036 status = nvme_check_bounds(ns, slba, nlb); 3037 if (status) { 3038 goto invalid; 3039 } 3040 3041 if (ns->params.zoned) { 3042 zone = nvme_get_zone_by_slba(ns, slba); 3043 3044 if (append) { 3045 bool piremap = !!(ctrl & NVME_RW_PIREMAP); 3046 3047 if (unlikely(slba != zone->d.zslba)) { 3048 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); 3049 status = NVME_INVALID_FIELD; 3050 goto invalid; 3051 } 3052 3053 if (n->params.zasl && 3054 data_size > (uint64_t)n->page_size << n->params.zasl) { 3055 trace_pci_nvme_err_zasl(data_size); 3056 return NVME_INVALID_FIELD | NVME_DNR; 3057 } 3058 3059 slba = zone->w_ptr; 3060 rw->slba = cpu_to_le64(slba); 3061 res->slba = cpu_to_le64(slba); 3062 3063 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3064 case NVME_ID_NS_DPS_TYPE_1: 3065 if (!piremap) { 3066 return NVME_INVALID_PROT_INFO | NVME_DNR; 3067 } 3068 3069 /* fallthrough */ 3070 3071 case NVME_ID_NS_DPS_TYPE_2: 3072 if (piremap) { 3073 
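                    /*
                     * Remap the initial reference tag so that it follows the
                     * write pointer LBA actually used for the append rather
                     * than the zone start LBA supplied by the host.
                     */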
uint32_t reftag = le32_to_cpu(rw->reftag); 3074 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba)); 3075 } 3076 3077 break; 3078 3079 case NVME_ID_NS_DPS_TYPE_3: 3080 if (piremap) { 3081 return NVME_INVALID_PROT_INFO | NVME_DNR; 3082 } 3083 3084 break; 3085 } 3086 } 3087 3088 status = nvme_check_zone_write(ns, zone, slba, nlb); 3089 if (status) { 3090 goto invalid; 3091 } 3092 3093 status = nvme_zrm_auto(n, ns, zone); 3094 if (status) { 3095 goto invalid; 3096 } 3097 3098 zone->w_ptr += nlb; 3099 } 3100 3101 data_offset = nvme_l2b(ns, slba); 3102 3103 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3104 return nvme_dif_rw(n, req); 3105 } 3106 3107 if (!wrz) { 3108 status = nvme_map_data(n, nlb, req); 3109 if (status) { 3110 goto invalid; 3111 } 3112 3113 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 3114 BLOCK_ACCT_WRITE); 3115 nvme_blk_write(blk, data_offset, nvme_rw_cb, req); 3116 } else { 3117 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, 3118 BDRV_REQ_MAY_UNMAP, nvme_rw_cb, 3119 req); 3120 } 3121 3122 return NVME_NO_COMPLETE; 3123 3124 invalid: 3125 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); 3126 return status | NVME_DNR; 3127 } 3128 3129 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) 3130 { 3131 return nvme_do_write(n, req, false, false); 3132 } 3133 3134 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) 3135 { 3136 return nvme_do_write(n, req, false, true); 3137 } 3138 3139 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) 3140 { 3141 return nvme_do_write(n, req, true, false); 3142 } 3143 3144 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, 3145 uint64_t *slba, uint32_t *zone_idx) 3146 { 3147 uint32_t dw10 = le32_to_cpu(c->cdw10); 3148 uint32_t dw11 = le32_to_cpu(c->cdw11); 3149 3150 if (!ns->params.zoned) { 3151 trace_pci_nvme_err_invalid_opc(c->opcode); 3152 return NVME_INVALID_OPCODE | NVME_DNR; 3153 } 3154 3155 *slba = ((uint64_t)dw11) << 32 | dw10; 3156 if (unlikely(*slba >= ns->id_ns.nsze)) { 3157 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze); 3158 *slba = 0; 3159 return NVME_LBA_RANGE | NVME_DNR; 3160 } 3161 3162 *zone_idx = nvme_zone_idx(ns, *slba); 3163 assert(*zone_idx < ns->num_zones); 3164 3165 return NVME_SUCCESS; 3166 } 3167 3168 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState, 3169 NvmeRequest *); 3170 3171 enum NvmeZoneProcessingMask { 3172 NVME_PROC_CURRENT_ZONE = 0, 3173 NVME_PROC_OPENED_ZONES = 1 << 0, 3174 NVME_PROC_CLOSED_ZONES = 1 << 1, 3175 NVME_PROC_READ_ONLY_ZONES = 1 << 2, 3176 NVME_PROC_FULL_ZONES = 1 << 3, 3177 }; 3178 3179 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, 3180 NvmeZoneState state, NvmeRequest *req) 3181 { 3182 return nvme_zrm_open(nvme_ctrl(req), ns, zone); 3183 } 3184 3185 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, 3186 NvmeZoneState state, NvmeRequest *req) 3187 { 3188 return nvme_zrm_close(ns, zone); 3189 } 3190 3191 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, 3192 NvmeZoneState state, NvmeRequest *req) 3193 { 3194 return nvme_zrm_finish(ns, zone); 3195 } 3196 3197 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, 3198 NvmeZoneState state, NvmeRequest *req) 3199 { 3200 uintptr_t *resets = (uintptr_t *)&req->opaque; 3201 struct nvme_zone_reset_ctx *ctx; 3202 3203 switch (state) { 3204 case NVME_ZONE_STATE_EMPTY: 3205 return NVME_SUCCESS; 3206 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 
3207 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 3208 case NVME_ZONE_STATE_CLOSED: 3209 case NVME_ZONE_STATE_FULL: 3210 break; 3211 default: 3212 return NVME_ZONE_INVAL_TRANSITION; 3213 } 3214 3215 /* 3216 * The zone reset aio callback needs to know the zone that is being reset 3217 * in order to transition the zone on completion. 3218 */ 3219 ctx = g_new(struct nvme_zone_reset_ctx, 1); 3220 ctx->req = req; 3221 ctx->zone = zone; 3222 3223 (*resets)++; 3224 3225 blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba), 3226 nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, 3227 nvme_aio_zone_reset_cb, ctx); 3228 3229 return NVME_NO_COMPLETE; 3230 } 3231 3232 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, 3233 NvmeZoneState state, NvmeRequest *req) 3234 { 3235 switch (state) { 3236 case NVME_ZONE_STATE_READ_ONLY: 3237 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE); 3238 /* fall through */ 3239 case NVME_ZONE_STATE_OFFLINE: 3240 return NVME_SUCCESS; 3241 default: 3242 return NVME_ZONE_INVAL_TRANSITION; 3243 } 3244 } 3245 3246 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) 3247 { 3248 uint16_t status; 3249 uint8_t state = nvme_get_zone_state(zone); 3250 3251 if (state == NVME_ZONE_STATE_EMPTY) { 3252 status = nvme_aor_check(ns, 1, 0); 3253 if (status) { 3254 return status; 3255 } 3256 nvme_aor_inc_active(ns); 3257 zone->d.za |= NVME_ZA_ZD_EXT_VALID; 3258 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 3259 return NVME_SUCCESS; 3260 } 3261 3262 return NVME_ZONE_INVAL_TRANSITION; 3263 } 3264 3265 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, 3266 enum NvmeZoneProcessingMask proc_mask, 3267 op_handler_t op_hndlr, NvmeRequest *req) 3268 { 3269 uint16_t status = NVME_SUCCESS; 3270 NvmeZoneState zs = nvme_get_zone_state(zone); 3271 bool proc_zone; 3272 3273 switch (zs) { 3274 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 3275 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 3276 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; 3277 break; 3278 case NVME_ZONE_STATE_CLOSED: 3279 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; 3280 break; 3281 case NVME_ZONE_STATE_READ_ONLY: 3282 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES; 3283 break; 3284 case NVME_ZONE_STATE_FULL: 3285 proc_zone = proc_mask & NVME_PROC_FULL_ZONES; 3286 break; 3287 default: 3288 proc_zone = false; 3289 } 3290 3291 if (proc_zone) { 3292 status = op_hndlr(ns, zone, zs, req); 3293 } 3294 3295 return status; 3296 } 3297 3298 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, 3299 enum NvmeZoneProcessingMask proc_mask, 3300 op_handler_t op_hndlr, NvmeRequest *req) 3301 { 3302 NvmeZone *next; 3303 uint16_t status = NVME_SUCCESS; 3304 int i; 3305 3306 if (!proc_mask) { 3307 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req); 3308 } else { 3309 if (proc_mask & NVME_PROC_CLOSED_ZONES) { 3310 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { 3311 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3312 req); 3313 if (status && status != NVME_NO_COMPLETE) { 3314 goto out; 3315 } 3316 } 3317 } 3318 if (proc_mask & NVME_PROC_OPENED_ZONES) { 3319 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { 3320 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3321 req); 3322 if (status && status != NVME_NO_COMPLETE) { 3323 goto out; 3324 } 3325 } 3326 3327 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { 3328 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3329 req); 3330 if (status && status 
!= NVME_NO_COMPLETE) { 3331 goto out; 3332 } 3333 } 3334 } 3335 if (proc_mask & NVME_PROC_FULL_ZONES) { 3336 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { 3337 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3338 req); 3339 if (status && status != NVME_NO_COMPLETE) { 3340 goto out; 3341 } 3342 } 3343 } 3344 3345 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { 3346 for (i = 0; i < ns->num_zones; i++, zone++) { 3347 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3348 req); 3349 if (status && status != NVME_NO_COMPLETE) { 3350 goto out; 3351 } 3352 } 3353 } 3354 } 3355 3356 out: 3357 return status; 3358 } 3359 3360 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) 3361 { 3362 NvmeCmd *cmd = (NvmeCmd *)&req->cmd; 3363 NvmeNamespace *ns = req->ns; 3364 NvmeZone *zone; 3365 uintptr_t *resets; 3366 uint8_t *zd_ext; 3367 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 3368 uint64_t slba = 0; 3369 uint32_t zone_idx = 0; 3370 uint16_t status; 3371 uint8_t action; 3372 bool all; 3373 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; 3374 3375 action = dw13 & 0xff; 3376 all = dw13 & 0x100; 3377 3378 req->status = NVME_SUCCESS; 3379 3380 if (!all) { 3381 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 3382 if (status) { 3383 return status; 3384 } 3385 } 3386 3387 zone = &ns->zone_array[zone_idx]; 3388 if (slba != zone->d.zslba) { 3389 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); 3390 return NVME_INVALID_FIELD | NVME_DNR; 3391 } 3392 3393 switch (action) { 3394 3395 case NVME_ZONE_ACTION_OPEN: 3396 if (all) { 3397 proc_mask = NVME_PROC_CLOSED_ZONES; 3398 } 3399 trace_pci_nvme_open_zone(slba, zone_idx, all); 3400 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req); 3401 break; 3402 3403 case NVME_ZONE_ACTION_CLOSE: 3404 if (all) { 3405 proc_mask = NVME_PROC_OPENED_ZONES; 3406 } 3407 trace_pci_nvme_close_zone(slba, zone_idx, all); 3408 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req); 3409 break; 3410 3411 case NVME_ZONE_ACTION_FINISH: 3412 if (all) { 3413 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; 3414 } 3415 trace_pci_nvme_finish_zone(slba, zone_idx, all); 3416 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req); 3417 break; 3418 3419 case NVME_ZONE_ACTION_RESET: 3420 resets = (uintptr_t *)&req->opaque; 3421 3422 if (all) { 3423 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | 3424 NVME_PROC_FULL_ZONES; 3425 } 3426 trace_pci_nvme_reset_zone(slba, zone_idx, all); 3427 3428 *resets = 1; 3429 3430 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req); 3431 3432 (*resets)--; 3433 3434 return *resets ? 
NVME_NO_COMPLETE : req->status; 3435 3436 case NVME_ZONE_ACTION_OFFLINE: 3437 if (all) { 3438 proc_mask = NVME_PROC_READ_ONLY_ZONES; 3439 } 3440 trace_pci_nvme_offline_zone(slba, zone_idx, all); 3441 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req); 3442 break; 3443 3444 case NVME_ZONE_ACTION_SET_ZD_EXT: 3445 trace_pci_nvme_set_descriptor_extension(slba, zone_idx); 3446 if (all || !ns->params.zd_extension_size) { 3447 return NVME_INVALID_FIELD | NVME_DNR; 3448 } 3449 zd_ext = nvme_get_zd_extension(ns, zone_idx); 3450 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req); 3451 if (status) { 3452 trace_pci_nvme_err_zd_extension_map_error(zone_idx); 3453 return status; 3454 } 3455 3456 status = nvme_set_zd_ext(ns, zone); 3457 if (status == NVME_SUCCESS) { 3458 trace_pci_nvme_zd_extension_set(zone_idx); 3459 return status; 3460 } 3461 break; 3462 3463 default: 3464 trace_pci_nvme_err_invalid_mgmt_action(action); 3465 status = NVME_INVALID_FIELD; 3466 } 3467 3468 if (status == NVME_ZONE_INVAL_TRANSITION) { 3469 trace_pci_nvme_err_invalid_zone_state_transition(action, slba, 3470 zone->d.za); 3471 } 3472 if (status) { 3473 status |= NVME_DNR; 3474 } 3475 3476 return status; 3477 } 3478 3479 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) 3480 { 3481 NvmeZoneState zs = nvme_get_zone_state(zl); 3482 3483 switch (zafs) { 3484 case NVME_ZONE_REPORT_ALL: 3485 return true; 3486 case NVME_ZONE_REPORT_EMPTY: 3487 return zs == NVME_ZONE_STATE_EMPTY; 3488 case NVME_ZONE_REPORT_IMPLICITLY_OPEN: 3489 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN; 3490 case NVME_ZONE_REPORT_EXPLICITLY_OPEN: 3491 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN; 3492 case NVME_ZONE_REPORT_CLOSED: 3493 return zs == NVME_ZONE_STATE_CLOSED; 3494 case NVME_ZONE_REPORT_FULL: 3495 return zs == NVME_ZONE_STATE_FULL; 3496 case NVME_ZONE_REPORT_READ_ONLY: 3497 return zs == NVME_ZONE_STATE_READ_ONLY; 3498 case NVME_ZONE_REPORT_OFFLINE: 3499 return zs == NVME_ZONE_STATE_OFFLINE; 3500 default: 3501 return false; 3502 } 3503 } 3504 3505 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) 3506 { 3507 NvmeCmd *cmd = (NvmeCmd *)&req->cmd; 3508 NvmeNamespace *ns = req->ns; 3509 /* cdw12 is zero-based number of dwords to return. 
Convert to bytes */ 3510 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2; 3511 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 3512 uint32_t zone_idx, zra, zrasf, partial; 3513 uint64_t max_zones, nr_zones = 0; 3514 uint16_t status; 3515 uint64_t slba; 3516 NvmeZoneDescr *z; 3517 NvmeZone *zone; 3518 NvmeZoneReportHeader *header; 3519 void *buf, *buf_p; 3520 size_t zone_entry_sz; 3521 int i; 3522 3523 req->status = NVME_SUCCESS; 3524 3525 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 3526 if (status) { 3527 return status; 3528 } 3529 3530 zra = dw13 & 0xff; 3531 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) { 3532 return NVME_INVALID_FIELD | NVME_DNR; 3533 } 3534 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) { 3535 return NVME_INVALID_FIELD | NVME_DNR; 3536 } 3537 3538 zrasf = (dw13 >> 8) & 0xff; 3539 if (zrasf > NVME_ZONE_REPORT_OFFLINE) { 3540 return NVME_INVALID_FIELD | NVME_DNR; 3541 } 3542 3543 if (data_size < sizeof(NvmeZoneReportHeader)) { 3544 return NVME_INVALID_FIELD | NVME_DNR; 3545 } 3546 3547 status = nvme_check_mdts(n, data_size); 3548 if (status) { 3549 return status; 3550 } 3551 3552 partial = (dw13 >> 16) & 0x01; 3553 3554 zone_entry_sz = sizeof(NvmeZoneDescr); 3555 if (zra == NVME_ZONE_REPORT_EXTENDED) { 3556 zone_entry_sz += ns->params.zd_extension_size; 3557 } 3558 3559 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; 3560 buf = g_malloc0(data_size); 3561 3562 zone = &ns->zone_array[zone_idx]; 3563 for (i = zone_idx; i < ns->num_zones; i++) { 3564 if (partial && nr_zones >= max_zones) { 3565 break; 3566 } 3567 if (nvme_zone_matches_filter(zrasf, zone++)) { 3568 nr_zones++; 3569 } 3570 } 3571 header = (NvmeZoneReportHeader *)buf; 3572 header->nr_zones = cpu_to_le64(nr_zones); 3573 3574 buf_p = buf + sizeof(NvmeZoneReportHeader); 3575 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) { 3576 zone = &ns->zone_array[zone_idx]; 3577 if (nvme_zone_matches_filter(zrasf, zone)) { 3578 z = (NvmeZoneDescr *)buf_p; 3579 buf_p += sizeof(NvmeZoneDescr); 3580 3581 z->zt = zone->d.zt; 3582 z->zs = zone->d.zs; 3583 z->zcap = cpu_to_le64(zone->d.zcap); 3584 z->zslba = cpu_to_le64(zone->d.zslba); 3585 z->za = zone->d.za; 3586 3587 if (nvme_wp_is_valid(zone)) { 3588 z->wp = cpu_to_le64(zone->d.wp); 3589 } else { 3590 z->wp = cpu_to_le64(~0ULL); 3591 } 3592 3593 if (zra == NVME_ZONE_REPORT_EXTENDED) { 3594 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) { 3595 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx), 3596 ns->params.zd_extension_size); 3597 } 3598 buf_p += ns->params.zd_extension_size; 3599 } 3600 3601 max_zones--; 3602 } 3603 } 3604 3605 status = nvme_c2h(n, (uint8_t *)buf, data_size, req); 3606 3607 g_free(buf); 3608 3609 return status; 3610 } 3611 3612 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) 3613 { 3614 NvmeNamespace *ns; 3615 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3616 3617 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), 3618 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); 3619 3620 if (!nvme_nsid_valid(n, nsid)) { 3621 return NVME_INVALID_NSID | NVME_DNR; 3622 } 3623 3624 /* 3625 * In the base NVM command set, Flush may apply to all namespaces 3626 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used 3627 * along with TP 4056 (Namespace Types), it may be pretty screwed up. 
3628 * 3629 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the 3630 * opcode with a specific command since we cannot determine a unique I/O 3631 * command set. Opcode 0h could have any other meaning than something 3632 * equivalent to flushing and say it DOES have completely different 3633 * semantics in some other command set - does an NSID of FFFFFFFFh then 3634 * mean "for all namespaces, apply whatever command set specific command 3635 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply 3636 * whatever command that uses the 0h opcode if, and only if, it allows NSID 3637 * to be FFFFFFFFh"? 3638 * 3639 * Anyway (and luckily), for now, we do not care about this since the 3640 * device only supports namespace types that includes the NVM Flush command 3641 * (NVM and Zoned), so always do an NVM Flush. 3642 */ 3643 if (req->cmd.opcode == NVME_CMD_FLUSH) { 3644 return nvme_flush(n, req); 3645 } 3646 3647 ns = nvme_ns(n, nsid); 3648 if (unlikely(!ns)) { 3649 return NVME_INVALID_FIELD | NVME_DNR; 3650 } 3651 3652 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 3653 trace_pci_nvme_err_invalid_opc(req->cmd.opcode); 3654 return NVME_INVALID_OPCODE | NVME_DNR; 3655 } 3656 3657 if (ns->status) { 3658 return ns->status; 3659 } 3660 3661 req->ns = ns; 3662 3663 switch (req->cmd.opcode) { 3664 case NVME_CMD_WRITE_ZEROES: 3665 return nvme_write_zeroes(n, req); 3666 case NVME_CMD_ZONE_APPEND: 3667 return nvme_zone_append(n, req); 3668 case NVME_CMD_WRITE: 3669 return nvme_write(n, req); 3670 case NVME_CMD_READ: 3671 return nvme_read(n, req); 3672 case NVME_CMD_COMPARE: 3673 return nvme_compare(n, req); 3674 case NVME_CMD_DSM: 3675 return nvme_dsm(n, req); 3676 case NVME_CMD_VERIFY: 3677 return nvme_verify(n, req); 3678 case NVME_CMD_COPY: 3679 return nvme_copy(n, req); 3680 case NVME_CMD_ZONE_MGMT_SEND: 3681 return nvme_zone_mgmt_send(n, req); 3682 case NVME_CMD_ZONE_MGMT_RECV: 3683 return nvme_zone_mgmt_recv(n, req); 3684 default: 3685 assert(false); 3686 } 3687 3688 return NVME_INVALID_OPCODE | NVME_DNR; 3689 } 3690 3691 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) 3692 { 3693 n->sq[sq->sqid] = NULL; 3694 timer_free(sq->timer); 3695 g_free(sq->io_req); 3696 if (sq->sqid) { 3697 g_free(sq); 3698 } 3699 } 3700 3701 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) 3702 { 3703 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 3704 NvmeRequest *r, *next; 3705 NvmeSQueue *sq; 3706 NvmeCQueue *cq; 3707 uint16_t qid = le16_to_cpu(c->qid); 3708 uint32_t nsid; 3709 3710 if (unlikely(!qid || nvme_check_sqid(n, qid))) { 3711 trace_pci_nvme_err_invalid_del_sq(qid); 3712 return NVME_INVALID_QID | NVME_DNR; 3713 } 3714 3715 trace_pci_nvme_del_sq(qid); 3716 3717 sq = n->sq[qid]; 3718 while (!QTAILQ_EMPTY(&sq->out_req_list)) { 3719 r = QTAILQ_FIRST(&sq->out_req_list); 3720 if (r->aiocb) { 3721 blk_aio_cancel(r->aiocb); 3722 } 3723 } 3724 3725 /* 3726 * Drain all namespaces if there are still outstanding requests that we 3727 * could not cancel explicitly. 
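     *
     * Commands that fan out into multiple AIOs, such as DSM, Copy, Zone
     * Reset and broadcast Flush, do not store a single cancellable aiocb in
     * the request and can only be waited for by draining.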
3728 */ 3729 if (!QTAILQ_EMPTY(&sq->out_req_list)) { 3730 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { 3731 NvmeNamespace *ns = nvme_ns(n, nsid); 3732 if (ns) { 3733 nvme_ns_drain(ns); 3734 } 3735 } 3736 } 3737 3738 assert(QTAILQ_EMPTY(&sq->out_req_list)); 3739 3740 if (!nvme_check_cqid(n, sq->cqid)) { 3741 cq = n->cq[sq->cqid]; 3742 QTAILQ_REMOVE(&cq->sq_list, sq, entry); 3743 3744 nvme_post_cqes(cq); 3745 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) { 3746 if (r->sq == sq) { 3747 QTAILQ_REMOVE(&cq->req_list, r, entry); 3748 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry); 3749 } 3750 } 3751 } 3752 3753 nvme_free_sq(sq, n); 3754 return NVME_SUCCESS; 3755 } 3756 3757 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, 3758 uint16_t sqid, uint16_t cqid, uint16_t size) 3759 { 3760 int i; 3761 NvmeCQueue *cq; 3762 3763 sq->ctrl = n; 3764 sq->dma_addr = dma_addr; 3765 sq->sqid = sqid; 3766 sq->size = size; 3767 sq->cqid = cqid; 3768 sq->head = sq->tail = 0; 3769 sq->io_req = g_new0(NvmeRequest, sq->size); 3770 3771 QTAILQ_INIT(&sq->req_list); 3772 QTAILQ_INIT(&sq->out_req_list); 3773 for (i = 0; i < sq->size; i++) { 3774 sq->io_req[i].sq = sq; 3775 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry); 3776 } 3777 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq); 3778 3779 assert(n->cq[cqid]); 3780 cq = n->cq[cqid]; 3781 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry); 3782 n->sq[sqid] = sq; 3783 } 3784 3785 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) 3786 { 3787 NvmeSQueue *sq; 3788 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd; 3789 3790 uint16_t cqid = le16_to_cpu(c->cqid); 3791 uint16_t sqid = le16_to_cpu(c->sqid); 3792 uint16_t qsize = le16_to_cpu(c->qsize); 3793 uint16_t qflags = le16_to_cpu(c->sq_flags); 3794 uint64_t prp1 = le64_to_cpu(c->prp1); 3795 3796 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags); 3797 3798 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) { 3799 trace_pci_nvme_err_invalid_create_sq_cqid(cqid); 3800 return NVME_INVALID_CQID | NVME_DNR; 3801 } 3802 if (unlikely(!sqid || sqid > n->params.max_ioqpairs || 3803 n->sq[sqid] != NULL)) { 3804 trace_pci_nvme_err_invalid_create_sq_sqid(sqid); 3805 return NVME_INVALID_QID | NVME_DNR; 3806 } 3807 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { 3808 trace_pci_nvme_err_invalid_create_sq_size(qsize); 3809 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 3810 } 3811 if (unlikely(prp1 & (n->page_size - 1))) { 3812 trace_pci_nvme_err_invalid_create_sq_addr(prp1); 3813 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 3814 } 3815 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) { 3816 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags)); 3817 return NVME_INVALID_FIELD | NVME_DNR; 3818 } 3819 sq = g_malloc0(sizeof(*sq)); 3820 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1); 3821 return NVME_SUCCESS; 3822 } 3823 3824 struct nvme_stats { 3825 uint64_t units_read; 3826 uint64_t units_written; 3827 uint64_t read_commands; 3828 uint64_t write_commands; 3829 }; 3830 3831 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) 3832 { 3833 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); 3834 3835 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; 3836 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; 3837 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; 3838 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; 3839 } 3840 3841 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len, 3842 uint64_t off, NvmeRequest *req) 3843 { 3844 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3845 struct nvme_stats stats = { 0 }; 3846 NvmeSmartLog smart = { 0 }; 3847 uint32_t trans_len; 3848 NvmeNamespace *ns; 3849 time_t current_ms; 3850 3851 if (off >= sizeof(smart)) { 3852 return NVME_INVALID_FIELD | NVME_DNR; 3853 } 3854 3855 if (nsid != 0xffffffff) { 3856 ns = nvme_ns(n, nsid); 3857 if (!ns) { 3858 return NVME_INVALID_NSID | NVME_DNR; 3859 } 3860 nvme_set_blk_stats(ns, &stats); 3861 } else { 3862 int i; 3863 3864 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 3865 ns = nvme_ns(n, i); 3866 if (!ns) { 3867 continue; 3868 } 3869 nvme_set_blk_stats(ns, &stats); 3870 } 3871 } 3872 3873 trans_len = MIN(sizeof(smart) - off, buf_len); 3874 smart.critical_warning = n->smart_critical_warning; 3875 3876 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, 3877 1000)); 3878 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written, 3879 1000)); 3880 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); 3881 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); 3882 3883 smart.temperature = cpu_to_le16(n->temperature); 3884 3885 if ((n->temperature >= n->features.temp_thresh_hi) || 3886 (n->temperature <= n->features.temp_thresh_low)) { 3887 smart.critical_warning |= NVME_SMART_TEMPERATURE; 3888 } 3889 3890 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 3891 smart.power_on_hours[0] = 3892 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60); 3893 3894 if (!rae) { 3895 nvme_clear_events(n, NVME_AER_TYPE_SMART); 3896 } 3897 3898 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); 3899 } 3900 3901 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, 3902 NvmeRequest *req) 3903 { 3904 uint32_t trans_len; 3905 NvmeFwSlotInfoLog fw_log = { 3906 .afi = 0x1, 3907 }; 3908 3909 if (off >= sizeof(fw_log)) { 3910 return NVME_INVALID_FIELD | NVME_DNR; 3911 } 3912 3913 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); 3914 trans_len = MIN(sizeof(fw_log) - off, buf_len); 3915 3916 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req); 3917 } 3918 3919 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 3920 uint64_t off, NvmeRequest *req) 3921 { 3922 uint32_t trans_len; 3923 NvmeErrorLog errlog; 3924 3925 if (off >= sizeof(errlog)) { 3926 return NVME_INVALID_FIELD | NVME_DNR; 3927 } 3928 3929 if (!rae) { 3930 nvme_clear_events(n, NVME_AER_TYPE_ERROR); 3931 } 3932 3933 memset(&errlog, 0x0, sizeof(errlog)); 3934 trans_len = MIN(sizeof(errlog) - off, buf_len); 3935 3936 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); 3937 } 3938 3939 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 3940 uint64_t off, NvmeRequest *req) 3941 { 3942 uint32_t nslist[1024]; 3943 uint32_t trans_len; 3944 int i = 0; 3945 uint32_t nsid; 3946 3947 memset(nslist, 0x0, sizeof(nslist)); 3948 trans_len = MIN(sizeof(nslist) - off, buf_len); 3949 3950 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != 3951 NVME_CHANGED_NSID_SIZE) { 3952 /* 3953 * If more than 1024 namespaces, the first entry in the log page should 3954 * be set to FFFFFFFFh and the others to 0 as spec. 
3955 */ 3956 if (i == ARRAY_SIZE(nslist)) { 3957 memset(nslist, 0x0, sizeof(nslist)); 3958 nslist[0] = 0xffffffff; 3959 break; 3960 } 3961 3962 nslist[i++] = nsid; 3963 clear_bit(nsid, n->changed_nsids); 3964 } 3965 3966 /* 3967 * Remove all the remaining list entries in case returns directly due to 3968 * more than 1024 namespaces. 3969 */ 3970 if (nslist[0] == 0xffffffff) { 3971 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE); 3972 } 3973 3974 if (!rae) { 3975 nvme_clear_events(n, NVME_AER_TYPE_NOTICE); 3976 } 3977 3978 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req); 3979 } 3980 3981 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, 3982 uint64_t off, NvmeRequest *req) 3983 { 3984 NvmeEffectsLog log = {}; 3985 const uint32_t *src_iocs = NULL; 3986 uint32_t trans_len; 3987 3988 if (off >= sizeof(log)) { 3989 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log)); 3990 return NVME_INVALID_FIELD | NVME_DNR; 3991 } 3992 3993 switch (NVME_CC_CSS(n->bar.cc)) { 3994 case NVME_CC_CSS_NVM: 3995 src_iocs = nvme_cse_iocs_nvm; 3996 /* fall through */ 3997 case NVME_CC_CSS_ADMIN_ONLY: 3998 break; 3999 case NVME_CC_CSS_CSI: 4000 switch (csi) { 4001 case NVME_CSI_NVM: 4002 src_iocs = nvme_cse_iocs_nvm; 4003 break; 4004 case NVME_CSI_ZONED: 4005 src_iocs = nvme_cse_iocs_zoned; 4006 break; 4007 } 4008 } 4009 4010 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); 4011 4012 if (src_iocs) { 4013 memcpy(log.iocs, src_iocs, sizeof(log.iocs)); 4014 } 4015 4016 trans_len = MIN(sizeof(log) - off, buf_len); 4017 4018 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); 4019 } 4020 4021 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) 4022 { 4023 NvmeCmd *cmd = &req->cmd; 4024 4025 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4026 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4027 uint32_t dw12 = le32_to_cpu(cmd->cdw12); 4028 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 4029 uint8_t lid = dw10 & 0xff; 4030 uint8_t lsp = (dw10 >> 8) & 0xf; 4031 uint8_t rae = (dw10 >> 15) & 0x1; 4032 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24; 4033 uint32_t numdl, numdu; 4034 uint64_t off, lpol, lpou; 4035 size_t len; 4036 uint16_t status; 4037 4038 numdl = (dw10 >> 16); 4039 numdu = (dw11 & 0xffff); 4040 lpol = dw12; 4041 lpou = dw13; 4042 4043 len = (((numdu << 16) | numdl) + 1) << 2; 4044 off = (lpou << 32ULL) | lpol; 4045 4046 if (off & 0x3) { 4047 return NVME_INVALID_FIELD | NVME_DNR; 4048 } 4049 4050 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off); 4051 4052 status = nvme_check_mdts(n, len); 4053 if (status) { 4054 return status; 4055 } 4056 4057 switch (lid) { 4058 case NVME_LOG_ERROR_INFO: 4059 return nvme_error_info(n, rae, len, off, req); 4060 case NVME_LOG_SMART_INFO: 4061 return nvme_smart_info(n, rae, len, off, req); 4062 case NVME_LOG_FW_SLOT_INFO: 4063 return nvme_fw_log_info(n, len, off, req); 4064 case NVME_LOG_CHANGED_NSLIST: 4065 return nvme_changed_nslist(n, rae, len, off, req); 4066 case NVME_LOG_CMD_EFFECTS: 4067 return nvme_cmd_effects(n, csi, len, off, req); 4068 default: 4069 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); 4070 return NVME_INVALID_FIELD | NVME_DNR; 4071 } 4072 } 4073 4074 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) 4075 { 4076 n->cq[cq->cqid] = NULL; 4077 timer_free(cq->timer); 4078 if (msix_enabled(&n->parent_obj)) { 4079 msix_vector_unuse(&n->parent_obj, cq->vector); 4080 } 4081 if (cq->cqid) { 4082 g_free(cq); 4083 } 4084 } 4085 4086 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) 
4087 { 4088 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 4089 NvmeCQueue *cq; 4090 uint16_t qid = le16_to_cpu(c->qid); 4091 4092 if (unlikely(!qid || nvme_check_cqid(n, qid))) { 4093 trace_pci_nvme_err_invalid_del_cq_cqid(qid); 4094 return NVME_INVALID_CQID | NVME_DNR; 4095 } 4096 4097 cq = n->cq[qid]; 4098 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) { 4099 trace_pci_nvme_err_invalid_del_cq_notempty(qid); 4100 return NVME_INVALID_QUEUE_DEL; 4101 } 4102 nvme_irq_deassert(n, cq); 4103 trace_pci_nvme_del_cq(qid); 4104 nvme_free_cq(cq, n); 4105 return NVME_SUCCESS; 4106 } 4107 4108 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, 4109 uint16_t cqid, uint16_t vector, uint16_t size, 4110 uint16_t irq_enabled) 4111 { 4112 int ret; 4113 4114 if (msix_enabled(&n->parent_obj)) { 4115 ret = msix_vector_use(&n->parent_obj, vector); 4116 assert(ret == 0); 4117 } 4118 cq->ctrl = n; 4119 cq->cqid = cqid; 4120 cq->size = size; 4121 cq->dma_addr = dma_addr; 4122 cq->phase = 1; 4123 cq->irq_enabled = irq_enabled; 4124 cq->vector = vector; 4125 cq->head = cq->tail = 0; 4126 QTAILQ_INIT(&cq->req_list); 4127 QTAILQ_INIT(&cq->sq_list); 4128 n->cq[cqid] = cq; 4129 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); 4130 } 4131 4132 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) 4133 { 4134 NvmeCQueue *cq; 4135 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd; 4136 uint16_t cqid = le16_to_cpu(c->cqid); 4137 uint16_t vector = le16_to_cpu(c->irq_vector); 4138 uint16_t qsize = le16_to_cpu(c->qsize); 4139 uint16_t qflags = le16_to_cpu(c->cq_flags); 4140 uint64_t prp1 = le64_to_cpu(c->prp1); 4141 4142 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, 4143 NVME_CQ_FLAGS_IEN(qflags) != 0); 4144 4145 if (unlikely(!cqid || cqid > n->params.max_ioqpairs || 4146 n->cq[cqid] != NULL)) { 4147 trace_pci_nvme_err_invalid_create_cq_cqid(cqid); 4148 return NVME_INVALID_QID | NVME_DNR; 4149 } 4150 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { 4151 trace_pci_nvme_err_invalid_create_cq_size(qsize); 4152 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 4153 } 4154 if (unlikely(prp1 & (n->page_size - 1))) { 4155 trace_pci_nvme_err_invalid_create_cq_addr(prp1); 4156 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 4157 } 4158 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) { 4159 trace_pci_nvme_err_invalid_create_cq_vector(vector); 4160 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 4161 } 4162 if (unlikely(vector >= n->params.msix_qsize)) { 4163 trace_pci_nvme_err_invalid_create_cq_vector(vector); 4164 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 4165 } 4166 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) { 4167 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags)); 4168 return NVME_INVALID_FIELD | NVME_DNR; 4169 } 4170 4171 cq = g_malloc0(sizeof(*cq)); 4172 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1, 4173 NVME_CQ_FLAGS_IEN(qflags)); 4174 4175 /* 4176 * It is only required to set qs_created when creating a completion queue; 4177 * creating a submission queue without a matching completion queue will 4178 * fail. 
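     *
     * The flag is later used to reject changes to the Number of Queues
     * feature once any I/O queue has been created.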
4179 */ 4180 n->qs_created = true; 4181 return NVME_SUCCESS; 4182 } 4183 4184 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) 4185 { 4186 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 4187 4188 return nvme_c2h(n, id, sizeof(id), req); 4189 } 4190 4191 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) 4192 { 4193 trace_pci_nvme_identify_ctrl(); 4194 4195 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req); 4196 } 4197 4198 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) 4199 { 4200 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4201 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 4202 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id; 4203 4204 trace_pci_nvme_identify_ctrl_csi(c->csi); 4205 4206 switch (c->csi) { 4207 case NVME_CSI_NVM: 4208 id_nvm->vsl = n->params.vsl; 4209 id_nvm->dmrsl = cpu_to_le32(n->dmrsl); 4210 break; 4211 4212 case NVME_CSI_ZONED: 4213 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl; 4214 break; 4215 4216 default: 4217 return NVME_INVALID_FIELD | NVME_DNR; 4218 } 4219 4220 return nvme_c2h(n, id, sizeof(id), req); 4221 } 4222 4223 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) 4224 { 4225 NvmeNamespace *ns; 4226 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4227 uint32_t nsid = le32_to_cpu(c->nsid); 4228 4229 trace_pci_nvme_identify_ns(nsid); 4230 4231 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4232 return NVME_INVALID_NSID | NVME_DNR; 4233 } 4234 4235 ns = nvme_ns(n, nsid); 4236 if (unlikely(!ns)) { 4237 if (!active) { 4238 ns = nvme_subsys_ns(n->subsys, nsid); 4239 if (!ns) { 4240 return nvme_rpt_empty_id_struct(n, req); 4241 } 4242 } else { 4243 return nvme_rpt_empty_id_struct(n, req); 4244 } 4245 } 4246 4247 if (active || ns->csi == NVME_CSI_NVM) { 4248 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); 4249 } 4250 4251 return NVME_INVALID_CMD_SET | NVME_DNR; 4252 } 4253 4254 static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) 4255 { 4256 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4257 uint16_t min_id = le16_to_cpu(c->ctrlid); 4258 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 4259 uint16_t *ids = &list[1]; 4260 NvmeNamespace *ns; 4261 NvmeCtrl *ctrl; 4262 int cntlid, nr_ids = 0; 4263 4264 trace_pci_nvme_identify_ns_attached_list(min_id); 4265 4266 if (c->nsid == NVME_NSID_BROADCAST) { 4267 return NVME_INVALID_FIELD | NVME_DNR; 4268 } 4269 4270 ns = nvme_subsys_ns(n->subsys, c->nsid); 4271 if (!ns) { 4272 return NVME_INVALID_FIELD | NVME_DNR; 4273 } 4274 4275 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { 4276 ctrl = nvme_subsys_ctrl(n->subsys, cntlid); 4277 if (!ctrl) { 4278 continue; 4279 } 4280 4281 if (!nvme_ns(ctrl, c->nsid)) { 4282 continue; 4283 } 4284 4285 ids[nr_ids++] = cntlid; 4286 } 4287 4288 list[0] = nr_ids; 4289 4290 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req); 4291 } 4292 4293 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, 4294 bool active) 4295 { 4296 NvmeNamespace *ns; 4297 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4298 uint32_t nsid = le32_to_cpu(c->nsid); 4299 4300 trace_pci_nvme_identify_ns_csi(nsid, c->csi); 4301 4302 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4303 return NVME_INVALID_NSID | NVME_DNR; 4304 } 4305 4306 ns = nvme_ns(n, nsid); 4307 if (unlikely(!ns)) { 4308 if (!active) { 4309 ns = nvme_subsys_ns(n->subsys, nsid); 4310 if (!ns) { 4311 return nvme_rpt_empty_id_struct(n, req); 4312 } 4313 } else { 
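/*
 * Not an active namespace on this controller; report the common
 * zero-filled identify data structure.
 */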
4314 return nvme_rpt_empty_id_struct(n, req); 4315 } 4316 } 4317 4318 if (c->csi == NVME_CSI_NVM) { 4319 return nvme_rpt_empty_id_struct(n, req); 4320 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { 4321 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), 4322 req); 4323 } 4324 4325 return NVME_INVALID_FIELD | NVME_DNR; 4326 } 4327 4328 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, 4329 bool active) 4330 { 4331 NvmeNamespace *ns; 4332 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4333 uint32_t min_nsid = le32_to_cpu(c->nsid); 4334 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4335 static const int data_len = sizeof(list); 4336 uint32_t *list_ptr = (uint32_t *)list; 4337 int i, j = 0; 4338 4339 trace_pci_nvme_identify_nslist(min_nsid); 4340 4341 /* 4342 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values 4343 * since the Active Namespace ID List should return namespaces with ids 4344 * *higher* than the NSID specified in the command. This is also specified 4345 * in the spec (NVM Express v1.3d, Section 5.15.4). 4346 */ 4347 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 4348 return NVME_INVALID_NSID | NVME_DNR; 4349 } 4350 4351 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4352 ns = nvme_ns(n, i); 4353 if (!ns) { 4354 if (!active) { 4355 ns = nvme_subsys_ns(n->subsys, i); 4356 if (!ns) { 4357 continue; 4358 } 4359 } else { 4360 continue; 4361 } 4362 } 4363 if (ns->params.nsid <= min_nsid) { 4364 continue; 4365 } 4366 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 4367 if (j == data_len / sizeof(uint32_t)) { 4368 break; 4369 } 4370 } 4371 4372 return nvme_c2h(n, list, data_len, req); 4373 } 4374 4375 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, 4376 bool active) 4377 { 4378 NvmeNamespace *ns; 4379 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4380 uint32_t min_nsid = le32_to_cpu(c->nsid); 4381 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4382 static const int data_len = sizeof(list); 4383 uint32_t *list_ptr = (uint32_t *)list; 4384 int i, j = 0; 4385 4386 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); 4387 4388 /* 4389 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
4390 */ 4391 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 4392 return NVME_INVALID_NSID | NVME_DNR; 4393 } 4394 4395 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) { 4396 return NVME_INVALID_FIELD | NVME_DNR; 4397 } 4398 4399 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4400 ns = nvme_ns(n, i); 4401 if (!ns) { 4402 if (!active) { 4403 ns = nvme_subsys_ns(n->subsys, i); 4404 if (!ns) { 4405 continue; 4406 } 4407 } else { 4408 continue; 4409 } 4410 } 4411 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { 4412 continue; 4413 } 4414 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 4415 if (j == data_len / sizeof(uint32_t)) { 4416 break; 4417 } 4418 } 4419 4420 return nvme_c2h(n, list, data_len, req); 4421 } 4422 4423 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) 4424 { 4425 NvmeNamespace *ns; 4426 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4427 uint32_t nsid = le32_to_cpu(c->nsid); 4428 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4429 uint8_t *pos = list; 4430 struct { 4431 NvmeIdNsDescr hdr; 4432 uint8_t v[NVME_NIDL_UUID]; 4433 } QEMU_PACKED uuid; 4434 struct { 4435 NvmeIdNsDescr hdr; 4436 uint64_t v; 4437 } QEMU_PACKED eui64; 4438 struct { 4439 NvmeIdNsDescr hdr; 4440 uint8_t v; 4441 } QEMU_PACKED csi; 4442 4443 trace_pci_nvme_identify_ns_descr_list(nsid); 4444 4445 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4446 return NVME_INVALID_NSID | NVME_DNR; 4447 } 4448 4449 ns = nvme_ns(n, nsid); 4450 if (unlikely(!ns)) { 4451 return NVME_INVALID_FIELD | NVME_DNR; 4452 } 4453 4454 /* 4455 * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must 4456 * provide a valid Namespace UUID in the Namespace Identification Descriptor 4457 * data structure. QEMU does not yet support setting NGUID. 
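 *
 * The descriptor list built below therefore always contains a UUID
 * descriptor, an optional EUI-64 descriptor and a CSI descriptor; the rest
 * of the 4096 byte buffer is left zeroed, which terminates the list.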
4458 */ 4459 uuid.hdr.nidt = NVME_NIDT_UUID; 4460 uuid.hdr.nidl = NVME_NIDL_UUID; 4461 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); 4462 memcpy(pos, &uuid, sizeof(uuid)); 4463 pos += sizeof(uuid); 4464 4465 if (ns->params.eui64) { 4466 eui64.hdr.nidt = NVME_NIDT_EUI64; 4467 eui64.hdr.nidl = NVME_NIDL_EUI64; 4468 eui64.v = cpu_to_be64(ns->params.eui64); 4469 memcpy(pos, &eui64, sizeof(eui64)); 4470 pos += sizeof(eui64); 4471 } 4472 4473 csi.hdr.nidt = NVME_NIDT_CSI; 4474 csi.hdr.nidl = NVME_NIDL_CSI; 4475 csi.v = ns->csi; 4476 memcpy(pos, &csi, sizeof(csi)); 4477 pos += sizeof(csi); 4478 4479 return nvme_c2h(n, list, sizeof(list), req); 4480 } 4481 4482 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) 4483 { 4484 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4485 static const int data_len = sizeof(list); 4486 4487 trace_pci_nvme_identify_cmd_set(); 4488 4489 NVME_SET_CSI(*list, NVME_CSI_NVM); 4490 NVME_SET_CSI(*list, NVME_CSI_ZONED); 4491 4492 return nvme_c2h(n, list, data_len, req); 4493 } 4494 4495 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) 4496 { 4497 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4498 4499 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid), 4500 c->csi); 4501 4502 switch (c->cns) { 4503 case NVME_ID_CNS_NS: 4504 return nvme_identify_ns(n, req, true); 4505 case NVME_ID_CNS_NS_PRESENT: 4506 return nvme_identify_ns(n, req, false); 4507 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: 4508 return nvme_identify_ns_attached_list(n, req); 4509 case NVME_ID_CNS_CS_NS: 4510 return nvme_identify_ns_csi(n, req, true); 4511 case NVME_ID_CNS_CS_NS_PRESENT: 4512 return nvme_identify_ns_csi(n, req, false); 4513 case NVME_ID_CNS_CTRL: 4514 return nvme_identify_ctrl(n, req); 4515 case NVME_ID_CNS_CS_CTRL: 4516 return nvme_identify_ctrl_csi(n, req); 4517 case NVME_ID_CNS_NS_ACTIVE_LIST: 4518 return nvme_identify_nslist(n, req, true); 4519 case NVME_ID_CNS_NS_PRESENT_LIST: 4520 return nvme_identify_nslist(n, req, false); 4521 case NVME_ID_CNS_CS_NS_ACTIVE_LIST: 4522 return nvme_identify_nslist_csi(n, req, true); 4523 case NVME_ID_CNS_CS_NS_PRESENT_LIST: 4524 return nvme_identify_nslist_csi(n, req, false); 4525 case NVME_ID_CNS_NS_DESCR_LIST: 4526 return nvme_identify_ns_descr_list(n, req); 4527 case NVME_ID_CNS_IO_COMMAND_SET: 4528 return nvme_identify_cmd_set(n, req); 4529 default: 4530 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); 4531 return NVME_INVALID_FIELD | NVME_DNR; 4532 } 4533 } 4534 4535 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req) 4536 { 4537 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff; 4538 4539 req->cqe.result = 1; 4540 if (nvme_check_sqid(n, sqid)) { 4541 return NVME_INVALID_FIELD | NVME_DNR; 4542 } 4543 4544 return NVME_SUCCESS; 4545 } 4546 4547 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts) 4548 { 4549 trace_pci_nvme_setfeat_timestamp(ts); 4550 4551 n->host_timestamp = le64_to_cpu(ts); 4552 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4553 } 4554 4555 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) 4556 { 4557 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4558 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms; 4559 4560 union nvme_timestamp { 4561 struct { 4562 uint64_t timestamp:48; 4563 uint64_t sync:1; 4564 uint64_t origin:3; 4565 uint64_t rsvd1:12; 4566 }; 4567 uint64_t all; 4568 }; 4569 4570 union nvme_timestamp ts; 4571 ts.all = 0; 4572 ts.timestamp = n->host_timestamp + elapsed_time; 
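/*
 * The reported value packs the 48-bit millisecond timestamp into bits 47:0,
 * the Synch bit into bit 48 and the Timestamp Origin into bits 51:49 (see
 * the bitfield above). For illustration only: a timestamp of 1000 ms with
 * origin 01b is reported as 0x00020000000003e8 prior to the little-endian
 * conversion below.
 */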
4573 4574 /* If the host timestamp is non-zero, set the timestamp origin */ 4575 ts.origin = n->host_timestamp ? 0x01 : 0x00; 4576 4577 trace_pci_nvme_getfeat_timestamp(ts.all); 4578 4579 return cpu_to_le64(ts.all); 4580 } 4581 4582 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) 4583 { 4584 uint64_t timestamp = nvme_get_timestamp(n); 4585 4586 return nvme_c2h(n, (uint8_t *)×tamp, sizeof(timestamp), req); 4587 } 4588 4589 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) 4590 { 4591 NvmeCmd *cmd = &req->cmd; 4592 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4593 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4594 uint32_t nsid = le32_to_cpu(cmd->nsid); 4595 uint32_t result; 4596 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 4597 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); 4598 uint16_t iv; 4599 NvmeNamespace *ns; 4600 int i; 4601 4602 static const uint32_t nvme_feature_default[NVME_FID_MAX] = { 4603 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, 4604 }; 4605 4606 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11); 4607 4608 if (!nvme_feature_support[fid]) { 4609 return NVME_INVALID_FIELD | NVME_DNR; 4610 } 4611 4612 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 4613 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4614 /* 4615 * The Reservation Notification Mask and Reservation Persistence 4616 * features require a status code of Invalid Field in Command when 4617 * NSID is FFFFFFFFh. Since the device does not support those 4618 * features we can always return Invalid Namespace or Format as we 4619 * should do for all other features. 4620 */ 4621 return NVME_INVALID_NSID | NVME_DNR; 4622 } 4623 4624 if (!nvme_ns(n, nsid)) { 4625 return NVME_INVALID_FIELD | NVME_DNR; 4626 } 4627 } 4628 4629 switch (sel) { 4630 case NVME_GETFEAT_SELECT_CURRENT: 4631 break; 4632 case NVME_GETFEAT_SELECT_SAVED: 4633 /* no features are saveable by the controller; fallthrough */ 4634 case NVME_GETFEAT_SELECT_DEFAULT: 4635 goto defaults; 4636 case NVME_GETFEAT_SELECT_CAP: 4637 result = nvme_feature_cap[fid]; 4638 goto out; 4639 } 4640 4641 switch (fid) { 4642 case NVME_TEMPERATURE_THRESHOLD: 4643 result = 0; 4644 4645 /* 4646 * The controller only implements the Composite Temperature sensor, so 4647 * return 0 for all other sensors. 4648 */ 4649 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4650 goto out; 4651 } 4652 4653 switch (NVME_TEMP_THSEL(dw11)) { 4654 case NVME_TEMP_THSEL_OVER: 4655 result = n->features.temp_thresh_hi; 4656 goto out; 4657 case NVME_TEMP_THSEL_UNDER: 4658 result = n->features.temp_thresh_low; 4659 goto out; 4660 } 4661 4662 return NVME_INVALID_FIELD | NVME_DNR; 4663 case NVME_ERROR_RECOVERY: 4664 if (!nvme_nsid_valid(n, nsid)) { 4665 return NVME_INVALID_NSID | NVME_DNR; 4666 } 4667 4668 ns = nvme_ns(n, nsid); 4669 if (unlikely(!ns)) { 4670 return NVME_INVALID_FIELD | NVME_DNR; 4671 } 4672 4673 result = ns->features.err_rec; 4674 goto out; 4675 case NVME_VOLATILE_WRITE_CACHE: 4676 result = 0; 4677 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4678 ns = nvme_ns(n, i); 4679 if (!ns) { 4680 continue; 4681 } 4682 4683 result = blk_enable_write_cache(ns->blkconf.blk); 4684 if (result) { 4685 break; 4686 } 4687 } 4688 trace_pci_nvme_getfeat_vwcache(result ? 
"enabled" : "disabled"); 4689 goto out; 4690 case NVME_ASYNCHRONOUS_EVENT_CONF: 4691 result = n->features.async_config; 4692 goto out; 4693 case NVME_TIMESTAMP: 4694 return nvme_get_feature_timestamp(n, req); 4695 default: 4696 break; 4697 } 4698 4699 defaults: 4700 switch (fid) { 4701 case NVME_TEMPERATURE_THRESHOLD: 4702 result = 0; 4703 4704 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4705 break; 4706 } 4707 4708 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) { 4709 result = NVME_TEMPERATURE_WARNING; 4710 } 4711 4712 break; 4713 case NVME_NUMBER_OF_QUEUES: 4714 result = (n->params.max_ioqpairs - 1) | 4715 ((n->params.max_ioqpairs - 1) << 16); 4716 trace_pci_nvme_getfeat_numq(result); 4717 break; 4718 case NVME_INTERRUPT_VECTOR_CONF: 4719 iv = dw11 & 0xffff; 4720 if (iv >= n->params.max_ioqpairs + 1) { 4721 return NVME_INVALID_FIELD | NVME_DNR; 4722 } 4723 4724 result = iv; 4725 if (iv == n->admin_cq.vector) { 4726 result |= NVME_INTVC_NOCOALESCING; 4727 } 4728 break; 4729 default: 4730 result = nvme_feature_default[fid]; 4731 break; 4732 } 4733 4734 out: 4735 req->cqe.result = cpu_to_le32(result); 4736 return NVME_SUCCESS; 4737 } 4738 4739 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) 4740 { 4741 uint16_t ret; 4742 uint64_t timestamp; 4743 4744 ret = nvme_h2c(n, (uint8_t *)×tamp, sizeof(timestamp), req); 4745 if (ret) { 4746 return ret; 4747 } 4748 4749 nvme_set_timestamp(n, timestamp); 4750 4751 return NVME_SUCCESS; 4752 } 4753 4754 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) 4755 { 4756 NvmeNamespace *ns = NULL; 4757 4758 NvmeCmd *cmd = &req->cmd; 4759 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4760 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4761 uint32_t nsid = le32_to_cpu(cmd->nsid); 4762 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 4763 uint8_t save = NVME_SETFEAT_SAVE(dw10); 4764 int i; 4765 4766 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); 4767 4768 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) { 4769 return NVME_FID_NOT_SAVEABLE | NVME_DNR; 4770 } 4771 4772 if (!nvme_feature_support[fid]) { 4773 return NVME_INVALID_FIELD | NVME_DNR; 4774 } 4775 4776 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 4777 if (nsid != NVME_NSID_BROADCAST) { 4778 if (!nvme_nsid_valid(n, nsid)) { 4779 return NVME_INVALID_NSID | NVME_DNR; 4780 } 4781 4782 ns = nvme_ns(n, nsid); 4783 if (unlikely(!ns)) { 4784 return NVME_INVALID_FIELD | NVME_DNR; 4785 } 4786 } 4787 } else if (nsid && nsid != NVME_NSID_BROADCAST) { 4788 if (!nvme_nsid_valid(n, nsid)) { 4789 return NVME_INVALID_NSID | NVME_DNR; 4790 } 4791 4792 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR; 4793 } 4794 4795 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) { 4796 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 4797 } 4798 4799 switch (fid) { 4800 case NVME_TEMPERATURE_THRESHOLD: 4801 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4802 break; 4803 } 4804 4805 switch (NVME_TEMP_THSEL(dw11)) { 4806 case NVME_TEMP_THSEL_OVER: 4807 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11); 4808 break; 4809 case NVME_TEMP_THSEL_UNDER: 4810 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11); 4811 break; 4812 default: 4813 return NVME_INVALID_FIELD | NVME_DNR; 4814 } 4815 4816 if ((n->temperature >= n->features.temp_thresh_hi) || 4817 (n->temperature <= n->features.temp_thresh_low)) { 4818 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH); 4819 } 4820 4821 break; 4822 case NVME_ERROR_RECOVERY: 4823 if (nsid == NVME_NSID_BROADCAST) { 4824 for (i = 1; i <= 
NVME_MAX_NAMESPACES; i++) { 4825 ns = nvme_ns(n, i); 4826 4827 if (!ns) { 4828 continue; 4829 } 4830 4831 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 4832 ns->features.err_rec = dw11; 4833 } 4834 } 4835 4836 break; 4837 } 4838 4839 assert(ns); 4840 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 4841 ns->features.err_rec = dw11; 4842 } 4843 break; 4844 case NVME_VOLATILE_WRITE_CACHE: 4845 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4846 ns = nvme_ns(n, i); 4847 if (!ns) { 4848 continue; 4849 } 4850 4851 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) { 4852 blk_flush(ns->blkconf.blk); 4853 } 4854 4855 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1); 4856 } 4857 4858 break; 4859 4860 case NVME_NUMBER_OF_QUEUES: 4861 if (n->qs_created) { 4862 return NVME_CMD_SEQ_ERROR | NVME_DNR; 4863 } 4864 4865 /* 4866 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR 4867 * and NSQR. 4868 */ 4869 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) { 4870 return NVME_INVALID_FIELD | NVME_DNR; 4871 } 4872 4873 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1, 4874 ((dw11 >> 16) & 0xffff) + 1, 4875 n->params.max_ioqpairs, 4876 n->params.max_ioqpairs); 4877 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) | 4878 ((n->params.max_ioqpairs - 1) << 16)); 4879 break; 4880 case NVME_ASYNCHRONOUS_EVENT_CONF: 4881 n->features.async_config = dw11; 4882 break; 4883 case NVME_TIMESTAMP: 4884 return nvme_set_feature_timestamp(n, req); 4885 case NVME_COMMAND_SET_PROFILE: 4886 if (dw11 & 0x1ff) { 4887 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); 4888 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR; 4889 } 4890 break; 4891 default: 4892 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 4893 } 4894 return NVME_SUCCESS; 4895 } 4896 4897 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) 4898 { 4899 trace_pci_nvme_aer(nvme_cid(req)); 4900 4901 if (n->outstanding_aers > n->params.aerl) { 4902 trace_pci_nvme_aer_aerl_exceeded(); 4903 return NVME_AER_LIMIT_EXCEEDED; 4904 } 4905 4906 n->aer_reqs[n->outstanding_aers] = req; 4907 n->outstanding_aers++; 4908 4909 if (!QTAILQ_EMPTY(&n->aer_queue)) { 4910 nvme_process_aers(n); 4911 } 4912 4913 return NVME_NO_COMPLETE; 4914 } 4915 4916 static void nvme_update_dmrsl(NvmeCtrl *n) 4917 { 4918 int nsid; 4919 4920 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { 4921 NvmeNamespace *ns = nvme_ns(n, nsid); 4922 if (!ns) { 4923 continue; 4924 } 4925 4926 n->dmrsl = MIN_NON_ZERO(n->dmrsl, 4927 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 4928 } 4929 } 4930 4931 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns) 4932 { 4933 ns->iocs = nvme_cse_iocs_none; 4934 switch (ns->csi) { 4935 case NVME_CSI_NVM: 4936 if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { 4937 ns->iocs = nvme_cse_iocs_nvm; 4938 } 4939 break; 4940 case NVME_CSI_ZONED: 4941 if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { 4942 ns->iocs = nvme_cse_iocs_zoned; 4943 } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { 4944 ns->iocs = nvme_cse_iocs_nvm; 4945 } 4946 break; 4947 } 4948 } 4949 4950 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) 4951 { 4952 NvmeNamespace *ns; 4953 NvmeCtrl *ctrl; 4954 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 4955 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 4956 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 4957 bool attach = !(dw10 & 0xf); 4958 uint16_t *nr_ids = &list[0]; 4959 uint16_t *ids = &list[1]; 4960 uint16_t ret; 4961 int i; 4962 4963 trace_pci_nvme_ns_attachment(nvme_cid(req), 
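/* CDW10[3:0] is the Select (SEL) field: 0h attaches, 1h detaches; any
 * non-zero value is treated as detach here */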
dw10 & 0xf); 4964 4965 if (!nvme_nsid_valid(n, nsid)) { 4966 return NVME_INVALID_NSID | NVME_DNR; 4967 } 4968 4969 ns = nvme_subsys_ns(n->subsys, nsid); 4970 if (!ns) { 4971 return NVME_INVALID_FIELD | NVME_DNR; 4972 } 4973 4974 ret = nvme_h2c(n, (uint8_t *)list, 4096, req); 4975 if (ret) { 4976 return ret; 4977 } 4978 4979 if (!*nr_ids) { 4980 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 4981 } 4982 4983 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1); 4984 for (i = 0; i < *nr_ids; i++) { 4985 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]); 4986 if (!ctrl) { 4987 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 4988 } 4989 4990 if (attach) { 4991 if (nvme_ns(ctrl, nsid)) { 4992 return NVME_NS_ALREADY_ATTACHED | NVME_DNR; 4993 } 4994 4995 if (ns->attached && !ns->params.shared) { 4996 return NVME_NS_PRIVATE | NVME_DNR; 4997 } 4998 4999 nvme_attach_ns(ctrl, ns); 5000 nvme_select_iocs_ns(ctrl, ns); 5001 } else { 5002 if (!nvme_ns(ctrl, nsid)) { 5003 return NVME_NS_NOT_ATTACHED | NVME_DNR; 5004 } 5005 5006 ctrl->namespaces[nsid] = NULL; 5007 ns->attached--; 5008 5009 nvme_update_dmrsl(ctrl); 5010 } 5011 5012 /* 5013 * Add namespace id to the changed namespace id list for event clearing 5014 * via Get Log Page command. 5015 */ 5016 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { 5017 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, 5018 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, 5019 NVME_LOG_CHANGED_NSLIST); 5020 } 5021 } 5022 5023 return NVME_SUCCESS; 5024 } 5025 5026 static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf, 5027 uint8_t mset, uint8_t pi, uint8_t pil, 5028 NvmeRequest *req) 5029 { 5030 int64_t len, offset; 5031 struct nvme_aio_format_ctx *ctx; 5032 BlockBackend *blk = ns->blkconf.blk; 5033 uint16_t ms; 5034 uintptr_t *num_formats = (uintptr_t *)&req->opaque; 5035 int *count; 5036 5037 if (ns->params.zoned) { 5038 return NVME_INVALID_FORMAT | NVME_DNR; 5039 } 5040 5041 trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil); 5042 5043 if (lbaf > ns->id_ns.nlbaf) { 5044 return NVME_INVALID_FORMAT | NVME_DNR; 5045 } 5046 5047 ms = ns->id_ns.lbaf[lbaf].ms; 5048 5049 if (pi && (ms < sizeof(NvmeDifTuple))) { 5050 return NVME_INVALID_FORMAT | NVME_DNR; 5051 } 5052 5053 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) { 5054 return NVME_INVALID_FIELD | NVME_DNR; 5055 } 5056 5057 nvme_ns_drain(ns); 5058 nvme_ns_shutdown(ns); 5059 nvme_ns_cleanup(ns); 5060 5061 ns->id_ns.dps = (pil << 3) | pi; 5062 ns->id_ns.flbas = lbaf | (mset << 4); 5063 5064 nvme_ns_init_format(ns); 5065 5066 ns->status = NVME_FORMAT_IN_PROGRESS; 5067 5068 len = ns->size; 5069 offset = 0; 5070 5071 count = g_new(int, 1); 5072 *count = 1; 5073 5074 (*num_formats)++; 5075 5076 while (len) { 5077 ctx = g_new(struct nvme_aio_format_ctx, 1); 5078 ctx->req = req; 5079 ctx->ns = ns; 5080 ctx->count = count; 5081 5082 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); 5083 5084 (*count)++; 5085 5086 blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP, 5087 nvme_aio_format_cb, ctx); 5088 5089 offset += bytes; 5090 len -= bytes; 5091 5092 } 5093 5094 if (--(*count)) { 5095 return NVME_NO_COMPLETE; 5096 } 5097 5098 g_free(count); 5099 ns->status = 0x0; 5100 (*num_formats)--; 5101 5102 return NVME_SUCCESS; 5103 } 5104 5105 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req) 5106 { 5107 NvmeNamespace *ns; 5108 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 5109 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 5110 uint8_t lbaf = dw10 & 0xf; 5111 uint8_t mset = (dw10 >> 4) & 0x1; 5112 uint8_t pi = 
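/* Format NVM CDW10 layout as parsed here: LBAF[3:0], MSET[4], PI[7:5],
 * PIL[8] */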
(dw10 >> 5) & 0x7; 5113 uint8_t pil = (dw10 >> 8) & 0x1; 5114 uintptr_t *num_formats = (uintptr_t *)&req->opaque; 5115 uint16_t status; 5116 int i; 5117 5118 trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil); 5119 5120 /* 1-initialize; see the comment in nvme_dsm */ 5121 *num_formats = 1; 5122 5123 if (nsid != NVME_NSID_BROADCAST) { 5124 if (!nvme_nsid_valid(n, nsid)) { 5125 return NVME_INVALID_NSID | NVME_DNR; 5126 } 5127 5128 ns = nvme_ns(n, nsid); 5129 if (!ns) { 5130 return NVME_INVALID_FIELD | NVME_DNR; 5131 } 5132 5133 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req); 5134 if (status && status != NVME_NO_COMPLETE) { 5135 req->status = status; 5136 } 5137 } else { 5138 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5139 ns = nvme_ns(n, i); 5140 if (!ns) { 5141 continue; 5142 } 5143 5144 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req); 5145 if (status && status != NVME_NO_COMPLETE) { 5146 req->status = status; 5147 break; 5148 } 5149 } 5150 } 5151 5152 /* account for the 1-initialization */ 5153 if (--(*num_formats)) { 5154 return NVME_NO_COMPLETE; 5155 } 5156 5157 return req->status; 5158 } 5159 5160 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) 5161 { 5162 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, 5163 nvme_adm_opc_str(req->cmd.opcode)); 5164 5165 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 5166 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); 5167 return NVME_INVALID_OPCODE | NVME_DNR; 5168 } 5169 5170 /* SGLs shall not be used for Admin commands in NVMe over PCIe */ 5171 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) { 5172 return NVME_INVALID_FIELD | NVME_DNR; 5173 } 5174 5175 switch (req->cmd.opcode) { 5176 case NVME_ADM_CMD_DELETE_SQ: 5177 return nvme_del_sq(n, req); 5178 case NVME_ADM_CMD_CREATE_SQ: 5179 return nvme_create_sq(n, req); 5180 case NVME_ADM_CMD_GET_LOG_PAGE: 5181 return nvme_get_log(n, req); 5182 case NVME_ADM_CMD_DELETE_CQ: 5183 return nvme_del_cq(n, req); 5184 case NVME_ADM_CMD_CREATE_CQ: 5185 return nvme_create_cq(n, req); 5186 case NVME_ADM_CMD_IDENTIFY: 5187 return nvme_identify(n, req); 5188 case NVME_ADM_CMD_ABORT: 5189 return nvme_abort(n, req); 5190 case NVME_ADM_CMD_SET_FEATURES: 5191 return nvme_set_feature(n, req); 5192 case NVME_ADM_CMD_GET_FEATURES: 5193 return nvme_get_feature(n, req); 5194 case NVME_ADM_CMD_ASYNC_EV_REQ: 5195 return nvme_aer(n, req); 5196 case NVME_ADM_CMD_NS_ATTACHMENT: 5197 return nvme_ns_attachment(n, req); 5198 case NVME_ADM_CMD_FORMAT_NVM: 5199 return nvme_format(n, req); 5200 default: 5201 assert(false); 5202 } 5203 5204 return NVME_INVALID_OPCODE | NVME_DNR; 5205 } 5206 5207 static void nvme_process_sq(void *opaque) 5208 { 5209 NvmeSQueue *sq = opaque; 5210 NvmeCtrl *n = sq->ctrl; 5211 NvmeCQueue *cq = n->cq[sq->cqid]; 5212 5213 uint16_t status; 5214 hwaddr addr; 5215 NvmeCmd cmd; 5216 NvmeRequest *req; 5217 5218 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) { 5219 addr = sq->dma_addr + sq->head * n->sqe_size; 5220 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) { 5221 trace_pci_nvme_err_addr_read(addr); 5222 trace_pci_nvme_err_cfs(); 5223 n->bar.csts = NVME_CSTS_FAILED; 5224 break; 5225 } 5226 nvme_inc_sq_head(sq); 5227 5228 req = QTAILQ_FIRST(&sq->req_list); 5229 QTAILQ_REMOVE(&sq->req_list, req, entry); 5230 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); 5231 nvme_req_clear(req); 5232 req->cqe.cid = cmd.cid; 5233 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd)); 5234 5235 status = sq->sqid ? 
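/* SQID 0 is the Admin Submission Queue; any other SQ carries I/O commands */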
nvme_io_cmd(n, req) : 5236 nvme_admin_cmd(n, req); 5237 if (status != NVME_NO_COMPLETE) { 5238 req->status = status; 5239 nvme_enqueue_req_completion(cq, req); 5240 } 5241 } 5242 } 5243 5244 static void nvme_ctrl_reset(NvmeCtrl *n) 5245 { 5246 NvmeNamespace *ns; 5247 int i; 5248 5249 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5250 ns = nvme_ns(n, i); 5251 if (!ns) { 5252 continue; 5253 } 5254 5255 nvme_ns_drain(ns); 5256 } 5257 5258 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 5259 if (n->sq[i] != NULL) { 5260 nvme_free_sq(n->sq[i], n); 5261 } 5262 } 5263 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 5264 if (n->cq[i] != NULL) { 5265 nvme_free_cq(n->cq[i], n); 5266 } 5267 } 5268 5269 while (!QTAILQ_EMPTY(&n->aer_queue)) { 5270 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue); 5271 QTAILQ_REMOVE(&n->aer_queue, event, entry); 5272 g_free(event); 5273 } 5274 5275 n->aer_queued = 0; 5276 n->outstanding_aers = 0; 5277 n->qs_created = false; 5278 5279 n->bar.cc = 0; 5280 } 5281 5282 static void nvme_ctrl_shutdown(NvmeCtrl *n) 5283 { 5284 NvmeNamespace *ns; 5285 int i; 5286 5287 if (n->pmr.dev) { 5288 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 5289 } 5290 5291 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5292 ns = nvme_ns(n, i); 5293 if (!ns) { 5294 continue; 5295 } 5296 5297 nvme_ns_shutdown(ns); 5298 } 5299 } 5300 5301 static void nvme_select_iocs(NvmeCtrl *n) 5302 { 5303 NvmeNamespace *ns; 5304 int i; 5305 5306 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5307 ns = nvme_ns(n, i); 5308 if (!ns) { 5309 continue; 5310 } 5311 5312 nvme_select_iocs_ns(n, ns); 5313 } 5314 } 5315 5316 static int nvme_start_ctrl(NvmeCtrl *n) 5317 { 5318 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12; 5319 uint32_t page_size = 1 << page_bits; 5320 5321 if (unlikely(n->cq[0])) { 5322 trace_pci_nvme_err_startfail_cq(); 5323 return -1; 5324 } 5325 if (unlikely(n->sq[0])) { 5326 trace_pci_nvme_err_startfail_sq(); 5327 return -1; 5328 } 5329 if (unlikely(!n->bar.asq)) { 5330 trace_pci_nvme_err_startfail_nbarasq(); 5331 return -1; 5332 } 5333 if (unlikely(!n->bar.acq)) { 5334 trace_pci_nvme_err_startfail_nbaracq(); 5335 return -1; 5336 } 5337 if (unlikely(n->bar.asq & (page_size - 1))) { 5338 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq); 5339 return -1; 5340 } 5341 if (unlikely(n->bar.acq & (page_size - 1))) { 5342 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq); 5343 return -1; 5344 } 5345 if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) { 5346 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc)); 5347 return -1; 5348 } 5349 if (unlikely(NVME_CC_MPS(n->bar.cc) < 5350 NVME_CAP_MPSMIN(n->bar.cap))) { 5351 trace_pci_nvme_err_startfail_page_too_small( 5352 NVME_CC_MPS(n->bar.cc), 5353 NVME_CAP_MPSMIN(n->bar.cap)); 5354 return -1; 5355 } 5356 if (unlikely(NVME_CC_MPS(n->bar.cc) > 5357 NVME_CAP_MPSMAX(n->bar.cap))) { 5358 trace_pci_nvme_err_startfail_page_too_large( 5359 NVME_CC_MPS(n->bar.cc), 5360 NVME_CAP_MPSMAX(n->bar.cap)); 5361 return -1; 5362 } 5363 if (unlikely(NVME_CC_IOCQES(n->bar.cc) < 5364 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) { 5365 trace_pci_nvme_err_startfail_cqent_too_small( 5366 NVME_CC_IOCQES(n->bar.cc), 5367 NVME_CTRL_CQES_MIN(n->bar.cap)); 5368 return -1; 5369 } 5370 if (unlikely(NVME_CC_IOCQES(n->bar.cc) > 5371 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) { 5372 trace_pci_nvme_err_startfail_cqent_too_large( 5373 NVME_CC_IOCQES(n->bar.cc), 5374 NVME_CTRL_CQES_MAX(n->bar.cap)); 5375 return -1; 5376 } 5377 if 
(unlikely(NVME_CC_IOSQES(n->bar.cc) < 5378 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) { 5379 trace_pci_nvme_err_startfail_sqent_too_small( 5380 NVME_CC_IOSQES(n->bar.cc), 5381 NVME_CTRL_SQES_MIN(n->bar.cap)); 5382 return -1; 5383 } 5384 if (unlikely(NVME_CC_IOSQES(n->bar.cc) > 5385 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) { 5386 trace_pci_nvme_err_startfail_sqent_too_large( 5387 NVME_CC_IOSQES(n->bar.cc), 5388 NVME_CTRL_SQES_MAX(n->bar.cap)); 5389 return -1; 5390 } 5391 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) { 5392 trace_pci_nvme_err_startfail_asqent_sz_zero(); 5393 return -1; 5394 } 5395 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) { 5396 trace_pci_nvme_err_startfail_acqent_sz_zero(); 5397 return -1; 5398 } 5399 5400 n->page_bits = page_bits; 5401 n->page_size = page_size; 5402 n->max_prp_ents = n->page_size / sizeof(uint64_t); 5403 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc); 5404 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc); 5405 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0, 5406 NVME_AQA_ACQS(n->bar.aqa) + 1, 1); 5407 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, 5408 NVME_AQA_ASQS(n->bar.aqa) + 1); 5409 5410 nvme_set_timestamp(n, 0ULL); 5411 5412 QTAILQ_INIT(&n->aer_queue); 5413 5414 nvme_select_iocs(n); 5415 5416 return 0; 5417 } 5418 5419 static void nvme_cmb_enable_regs(NvmeCtrl *n) 5420 { 5421 NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1); 5422 NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1); 5423 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); 5424 5425 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); 5426 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); 5427 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); 5428 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); 5429 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); 5430 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ 5431 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb); 5432 } 5433 5434 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, 5435 unsigned size) 5436 { 5437 if (unlikely(offset & (sizeof(uint32_t) - 1))) { 5438 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32, 5439 "MMIO write not 32-bit aligned," 5440 " offset=0x%"PRIx64"", offset); 5441 /* should be ignored, fall through for now */ 5442 } 5443 5444 if (unlikely(size < sizeof(uint32_t))) { 5445 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall, 5446 "MMIO write smaller than 32-bits," 5447 " offset=0x%"PRIx64", size=%u", 5448 offset, size); 5449 /* should be ignored, fall through for now */ 5450 } 5451 5452 switch (offset) { 5453 case 0xc: /* INTMS */ 5454 if (unlikely(msix_enabled(&(n->parent_obj)))) { 5455 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 5456 "undefined access to interrupt mask set" 5457 " when MSI-X is enabled"); 5458 /* should be ignored, fall through for now */ 5459 } 5460 n->bar.intms |= data & 0xffffffff; 5461 n->bar.intmc = n->bar.intms; 5462 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc); 5463 nvme_irq_check(n); 5464 break; 5465 case 0x10: /* INTMC */ 5466 if (unlikely(msix_enabled(&(n->parent_obj)))) { 5467 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 5468 "undefined access to interrupt mask clr" 5469 " when MSI-X is enabled"); 5470 /* should be ignored, fall through for now */ 5471 } 5472 n->bar.intms &= ~(data & 0xffffffff); 5473 n->bar.intmc = n->bar.intms; 5474 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc); 5475 nvme_irq_check(n); 5476 break; 5477 case 0x14: /* CC */ 5478 trace_pci_nvme_mmio_cfg(data & 0xffffffff); 5479 /* Windows first sends data, then sends enable bit */ 5480 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) && 5481 !NVME_CC_SHN(data) 
&& !NVME_CC_SHN(n->bar.cc)) 5482 { 5483 n->bar.cc = data; 5484 } 5485 5486 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) { 5487 n->bar.cc = data; 5488 if (unlikely(nvme_start_ctrl(n))) { 5489 trace_pci_nvme_err_startfail(); 5490 n->bar.csts = NVME_CSTS_FAILED; 5491 } else { 5492 trace_pci_nvme_mmio_start_success(); 5493 n->bar.csts = NVME_CSTS_READY; 5494 } 5495 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) { 5496 trace_pci_nvme_mmio_stopped(); 5497 nvme_ctrl_reset(n); 5498 n->bar.csts &= ~NVME_CSTS_READY; 5499 } 5500 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) { 5501 trace_pci_nvme_mmio_shutdown_set(); 5502 nvme_ctrl_shutdown(n); 5503 n->bar.cc = data; 5504 n->bar.csts |= NVME_CSTS_SHST_COMPLETE; 5505 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) { 5506 trace_pci_nvme_mmio_shutdown_cleared(); 5507 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE; 5508 n->bar.cc = data; 5509 } 5510 break; 5511 case 0x1c: /* CSTS */ 5512 if (data & (1 << 4)) { 5513 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported, 5514 "attempted to W1C CSTS.NSSRO" 5515 " but CAP.NSSRS is zero (not supported)"); 5516 } else if (data != 0) { 5517 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts, 5518 "attempted to set a read only bit" 5519 " of controller status"); 5520 } 5521 break; 5522 case 0x20: /* NSSR */ 5523 if (data == 0x4e564d65) { 5524 trace_pci_nvme_ub_mmiowr_ssreset_unsupported(); 5525 } else { 5526 /* The spec says that writes of other values have no effect */ 5527 return; 5528 } 5529 break; 5530 case 0x24: /* AQA */ 5531 n->bar.aqa = data & 0xffffffff; 5532 trace_pci_nvme_mmio_aqattr(data & 0xffffffff); 5533 break; 5534 case 0x28: /* ASQ */ 5535 n->bar.asq = size == 8 ? data : 5536 (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff); 5537 trace_pci_nvme_mmio_asqaddr(data); 5538 break; 5539 case 0x2c: /* ASQ hi */ 5540 n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32); 5541 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq); 5542 break; 5543 case 0x30: /* ACQ */ 5544 trace_pci_nvme_mmio_acqaddr(data); 5545 n->bar.acq = size == 8 ? data : 5546 (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff); 5547 break; 5548 case 0x34: /* ACQ hi */ 5549 n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32); 5550 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq); 5551 break; 5552 case 0x38: /* CMBLOC */ 5553 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved, 5554 "invalid write to reserved CMBLOC" 5555 " when CMBSZ is zero, ignored"); 5556 return; 5557 case 0x3C: /* CMBSZ */ 5558 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly, 5559 "invalid write to read only CMBSZ, ignored"); 5560 return; 5561 case 0x50: /* CMBMSC */ 5562 if (!NVME_CAP_CMBS(n->bar.cap)) { 5563 return; 5564 } 5565 5566 n->bar.cmbmsc = size == 8 ? 
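/* a 64-bit write replaces CMBMSC wholesale; a 32-bit write updates only the
 * low dword (the high dword is written via offset 0x54 below) */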
data : 5567 (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff); 5568 n->cmb.cmse = false; 5569 5570 if (NVME_CMBMSC_CRE(data)) { 5571 nvme_cmb_enable_regs(n); 5572 5573 if (NVME_CMBMSC_CMSE(data)) { 5574 hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT; 5575 if (cba + int128_get64(n->cmb.mem.size) < cba) { 5576 NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1); 5577 return; 5578 } 5579 5580 n->cmb.cba = cba; 5581 n->cmb.cmse = true; 5582 } 5583 } else { 5584 n->bar.cmbsz = 0; 5585 n->bar.cmbloc = 0; 5586 } 5587 5588 return; 5589 case 0x54: /* CMBMSC hi */ 5590 n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32); 5591 return; 5592 5593 case 0xe00: /* PMRCAP */ 5594 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, 5595 "invalid write to PMRCAP register, ignored"); 5596 return; 5597 case 0xe04: /* PMRCTL */ 5598 n->bar.pmrctl = data; 5599 if (NVME_PMRCTL_EN(data)) { 5600 memory_region_set_enabled(&n->pmr.dev->mr, true); 5601 n->bar.pmrsts = 0; 5602 } else { 5603 memory_region_set_enabled(&n->pmr.dev->mr, false); 5604 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); 5605 n->pmr.cmse = false; 5606 } 5607 return; 5608 case 0xe08: /* PMRSTS */ 5609 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, 5610 "invalid write to PMRSTS register, ignored"); 5611 return; 5612 case 0xe0C: /* PMREBS */ 5613 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly, 5614 "invalid write to PMREBS register, ignored"); 5615 return; 5616 case 0xe10: /* PMRSWTP */ 5617 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly, 5618 "invalid write to PMRSWTP register, ignored"); 5619 return; 5620 case 0xe14: /* PMRMSCL */ 5621 if (!NVME_CAP_PMRS(n->bar.cap)) { 5622 return; 5623 } 5624 5625 n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff); 5626 n->pmr.cmse = false; 5627 5628 if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) { 5629 hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT; 5630 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) { 5631 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1); 5632 return; 5633 } 5634 5635 n->pmr.cmse = true; 5636 n->pmr.cba = cba; 5637 } 5638 5639 return; 5640 case 0xe18: /* PMRMSCU */ 5641 if (!NVME_CAP_PMRS(n->bar.cap)) { 5642 return; 5643 } 5644 5645 n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32); 5646 return; 5647 default: 5648 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, 5649 "invalid MMIO write," 5650 " offset=0x%"PRIx64", data=%"PRIx64"", 5651 offset, data); 5652 break; 5653 } 5654 } 5655 5656 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) 5657 { 5658 NvmeCtrl *n = (NvmeCtrl *)opaque; 5659 uint8_t *ptr = (uint8_t *)&n->bar; 5660 uint64_t val = 0; 5661 5662 trace_pci_nvme_mmio_read(addr, size); 5663 5664 if (unlikely(addr & (sizeof(uint32_t) - 1))) { 5665 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, 5666 "MMIO read not 32-bit aligned," 5667 " offset=0x%"PRIx64"", addr); 5668 /* should RAZ, fall through for now */ 5669 } else if (unlikely(size < sizeof(uint32_t))) { 5670 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall, 5671 "MMIO read smaller than 32-bits," 5672 " offset=0x%"PRIx64"", addr); 5673 /* should RAZ, fall through for now */ 5674 } 5675 5676 if (addr < sizeof(n->bar)) { 5677 /* 5678 * When PMRWBM bit 1 is set, a read from 5679 * PMRSTS should ensure that prior writes 5680 * have made it to persistent media 5681 */ 5682 if (addr == 0xe08 && 5683 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) { 5684 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 5685 } 5686 memcpy(&val, ptr + addr, size); 5687 } else { 5688
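/*
 * The read is beyond the last BAR register (this includes the doorbell
 * area, which is write-only); log it and return zero.
 */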
NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs, 5689 "MMIO read beyond last register," 5690 " offset=0x%"PRIx64", returning 0", addr); 5691 } 5692 5693 return val; 5694 } 5695 5696 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) 5697 { 5698 uint32_t qid; 5699 5700 if (unlikely(addr & ((1 << 2) - 1))) { 5701 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, 5702 "doorbell write not 32-bit aligned," 5703 " offset=0x%"PRIx64", ignoring", addr); 5704 return; 5705 } 5706 5707 if (((addr - 0x1000) >> 2) & 1) { 5708 /* Completion queue doorbell write */ 5709 5710 uint16_t new_head = val & 0xffff; 5711 int start_sqs; 5712 NvmeCQueue *cq; 5713 5714 qid = (addr - (0x1000 + (1 << 2))) >> 3; 5715 if (unlikely(nvme_check_cqid(n, qid))) { 5716 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq, 5717 "completion queue doorbell write" 5718 " for nonexistent queue," 5719 " sqid=%"PRIu32", ignoring", qid); 5720 5721 /* 5722 * NVM Express v1.3d, Section 4.1 states: "If host software writes 5723 * an invalid value to the Submission Queue Tail Doorbell or 5724 * Completion Queue Head Doorbell register and an Asynchronous Event 5725 * Request command is outstanding, then an asynchronous event is 5726 * posted to the Admin Completion Queue with a status code of 5727 * Invalid Doorbell Write Value." 5728 * 5729 * Also note that the spec includes the "Invalid Doorbell Register" 5730 * status code, but nowhere does it specify when to use it. 5731 * However, it seems reasonable to use it here in a similar 5732 * fashion. 5733 */ 5734 if (n->outstanding_aers) { 5735 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5736 NVME_AER_INFO_ERR_INVALID_DB_REGISTER, 5737 NVME_LOG_ERROR_INFO); 5738 } 5739 5740 return; 5741 } 5742 5743 cq = n->cq[qid]; 5744 if (unlikely(new_head >= cq->size)) { 5745 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead, 5746 "completion queue doorbell write value" 5747 " beyond queue size, sqid=%"PRIu32"," 5748 " new_head=%"PRIu16", ignoring", 5749 qid, new_head); 5750 5751 if (n->outstanding_aers) { 5752 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5753 NVME_AER_INFO_ERR_INVALID_DB_VALUE, 5754 NVME_LOG_ERROR_INFO); 5755 } 5756 5757 return; 5758 } 5759 5760 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); 5761 5762 start_sqs = nvme_cq_full(cq) ?
1 : 0; 5763 cq->head = new_head; 5764 if (start_sqs) { 5765 NvmeSQueue *sq; 5766 QTAILQ_FOREACH(sq, &cq->sq_list, entry) { 5767 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 5768 } 5769 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 5770 } 5771 5772 if (cq->tail == cq->head) { 5773 nvme_irq_deassert(n, cq); 5774 } 5775 } else { 5776 /* Submission queue doorbell write */ 5777 5778 uint16_t new_tail = val & 0xffff; 5779 NvmeSQueue *sq; 5780 5781 qid = (addr - 0x1000) >> 3; 5782 if (unlikely(nvme_check_sqid(n, qid))) { 5783 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq, 5784 "submission queue doorbell write" 5785 " for nonexistent queue," 5786 " sqid=%"PRIu32", ignoring", qid); 5787 5788 if (n->outstanding_aers) { 5789 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5790 NVME_AER_INFO_ERR_INVALID_DB_REGISTER, 5791 NVME_LOG_ERROR_INFO); 5792 } 5793 5794 return; 5795 } 5796 5797 sq = n->sq[qid]; 5798 if (unlikely(new_tail >= sq->size)) { 5799 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail, 5800 "submission queue doorbell write value" 5801 " beyond queue size, sqid=%"PRIu32"," 5802 " new_tail=%"PRIu16", ignoring", 5803 qid, new_tail); 5804 5805 if (n->outstanding_aers) { 5806 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5807 NVME_AER_INFO_ERR_INVALID_DB_VALUE, 5808 NVME_LOG_ERROR_INFO); 5809 } 5810 5811 return; 5812 } 5813 5814 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail); 5815 5816 sq->tail = new_tail; 5817 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 5818 } 5819 } 5820 5821 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, 5822 unsigned size) 5823 { 5824 NvmeCtrl *n = (NvmeCtrl *)opaque; 5825 5826 trace_pci_nvme_mmio_write(addr, data, size); 5827 5828 if (addr < sizeof(n->bar)) { 5829 nvme_write_bar(n, addr, data, size); 5830 } else { 5831 nvme_process_db(n, addr, data); 5832 } 5833 } 5834 5835 static const MemoryRegionOps nvme_mmio_ops = { 5836 .read = nvme_mmio_read, 5837 .write = nvme_mmio_write, 5838 .endianness = DEVICE_LITTLE_ENDIAN, 5839 .impl = { 5840 .min_access_size = 2, 5841 .max_access_size = 8, 5842 }, 5843 }; 5844 5845 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, 5846 unsigned size) 5847 { 5848 NvmeCtrl *n = (NvmeCtrl *)opaque; 5849 stn_le_p(&n->cmb.buf[addr], size, data); 5850 } 5851 5852 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) 5853 { 5854 NvmeCtrl *n = (NvmeCtrl *)opaque; 5855 return ldn_le_p(&n->cmb.buf[addr], size); 5856 } 5857 5858 static const MemoryRegionOps nvme_cmb_ops = { 5859 .read = nvme_cmb_read, 5860 .write = nvme_cmb_write, 5861 .endianness = DEVICE_LITTLE_ENDIAN, 5862 .impl = { 5863 .min_access_size = 1, 5864 .max_access_size = 8, 5865 }, 5866 }; 5867 5868 static void nvme_check_constraints(NvmeCtrl *n, Error **errp) 5869 { 5870 NvmeParams *params = &n->params; 5871 5872 if (params->num_queues) { 5873 warn_report("num_queues is deprecated; please use max_ioqpairs " 5874 "instead"); 5875 5876 params->max_ioqpairs = params->num_queues - 1; 5877 } 5878 5879 if (n->namespace.blkconf.blk && n->subsys) { 5880 error_setg(errp, "subsystem support is unavailable with legacy " 5881 "namespace ('drive' property)"); 5882 return; 5883 } 5884 5885 if (params->max_ioqpairs < 1 || 5886 params->max_ioqpairs > NVME_MAX_IOQPAIRS) { 5887 error_setg(errp, "max_ioqpairs must be between 1 and %d", 5888 NVME_MAX_IOQPAIRS); 5889 return; 5890 } 5891 5892 if (params->msix_qsize < 1 || 5893 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) { 5894 
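/* the MSI-X Table Size field is 11 bits wide (PCI_MSIX_FLAGS_QSIZE is
 * 0x7ff), so at most 2048 vectors can be advertised */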
error_setg(errp, "msix_qsize must be between 1 and %d", 5895 PCI_MSIX_FLAGS_QSIZE + 1); 5896 return; 5897 } 5898 5899 if (!params->serial) { 5900 error_setg(errp, "serial property not set"); 5901 return; 5902 } 5903 5904 if (n->pmr.dev) { 5905 if (host_memory_backend_is_mapped(n->pmr.dev)) { 5906 error_setg(errp, "can't use already busy memdev: %s", 5907 object_get_canonical_path_component(OBJECT(n->pmr.dev))); 5908 return; 5909 } 5910 5911 if (!is_power_of_2(n->pmr.dev->size)) { 5912 error_setg(errp, "pmr backend size needs to be power of 2 in size"); 5913 return; 5914 } 5915 5916 host_memory_backend_set_mapped(n->pmr.dev, true); 5917 } 5918 5919 if (n->params.zasl > n->params.mdts) { 5920 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less " 5921 "than or equal to mdts (Maximum Data Transfer Size)"); 5922 return; 5923 } 5924 5925 if (!n->params.vsl) { 5926 error_setg(errp, "vsl must be non-zero"); 5927 return; 5928 } 5929 } 5930 5931 static void nvme_init_state(NvmeCtrl *n) 5932 { 5933 /* add one to max_ioqpairs to account for the admin queue pair */ 5934 n->reg_size = pow2ceil(sizeof(NvmeBar) + 5935 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE); 5936 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); 5937 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1); 5938 n->temperature = NVME_TEMPERATURE; 5939 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING; 5940 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 5941 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); 5942 } 5943 5944 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) 5945 { 5946 uint64_t cmb_size = n->params.cmb_size_mb * MiB; 5947 5948 n->cmb.buf = g_malloc0(cmb_size); 5949 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n, 5950 "nvme-cmb", cmb_size); 5951 pci_register_bar(pci_dev, NVME_CMB_BIR, 5952 PCI_BASE_ADDRESS_SPACE_MEMORY | 5953 PCI_BASE_ADDRESS_MEM_TYPE_64 | 5954 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem); 5955 5956 NVME_CAP_SET_CMBS(n->bar.cap, 1); 5957 5958 if (n->params.legacy_cmb) { 5959 nvme_cmb_enable_regs(n); 5960 n->cmb.cmse = true; 5961 } 5962 } 5963 5964 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) 5965 { 5966 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1); 5967 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1); 5968 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR); 5969 /* Turn on bit 1 support */ 5970 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02); 5971 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1); 5972 5973 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap), 5974 PCI_BASE_ADDRESS_SPACE_MEMORY | 5975 PCI_BASE_ADDRESS_MEM_TYPE_64 | 5976 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr); 5977 5978 memory_region_set_enabled(&n->pmr.dev->mr, false); 5979 } 5980 5981 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) 5982 { 5983 uint8_t *pci_conf = pci_dev->config; 5984 uint64_t bar_size, msix_table_size, msix_pba_size; 5985 unsigned msix_table_offset, msix_pba_offset; 5986 int ret; 5987 5988 Error *err = NULL; 5989 5990 pci_conf[PCI_INTERRUPT_PIN] = 1; 5991 pci_config_set_prog_interface(pci_conf, 0x2); 5992 5993 if (n->params.use_intel_id) { 5994 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL); 5995 pci_config_set_device_id(pci_conf, 0x5845); 5996 } else { 5997 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT); 5998 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME); 5999 } 6000 6001 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); 6002 pcie_endpoint_cap_init(pci_dev, 0x80); 6003 6004 bar_size = 
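/* BAR0 layout: controller registers and doorbells (reg_size), then the
 * MSI-X table, then the PBA, each 4 KiB aligned; the total is rounded up
 * to a power of two */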
QEMU_ALIGN_UP(n->reg_size, 4 * KiB); 6005 msix_table_offset = bar_size; 6006 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize; 6007 6008 bar_size += msix_table_size; 6009 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); 6010 msix_pba_offset = bar_size; 6011 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8; 6012 6013 bar_size += msix_pba_size; 6014 bar_size = pow2ceil(bar_size); 6015 6016 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size); 6017 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme", 6018 n->reg_size); 6019 memory_region_add_subregion(&n->bar0, 0, &n->iomem); 6020 6021 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | 6022 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); 6023 ret = msix_init(pci_dev, n->params.msix_qsize, 6024 &n->bar0, 0, msix_table_offset, 6025 &n->bar0, 0, msix_pba_offset, 0, &err); 6026 if (ret < 0) { 6027 if (ret == -ENOTSUP) { 6028 warn_report_err(err); 6029 } else { 6030 error_propagate(errp, err); 6031 return ret; 6032 } 6033 } 6034 6035 if (n->params.cmb_size_mb) { 6036 nvme_init_cmb(n, pci_dev); 6037 } 6038 6039 if (n->pmr.dev) { 6040 nvme_init_pmr(n, pci_dev); 6041 } 6042 6043 return 0; 6044 } 6045 6046 static void nvme_init_subnqn(NvmeCtrl *n) 6047 { 6048 NvmeSubsystem *subsys = n->subsys; 6049 NvmeIdCtrl *id = &n->id_ctrl; 6050 6051 if (!subsys) { 6052 snprintf((char *)id->subnqn, sizeof(id->subnqn), 6053 "nqn.2019-08.org.qemu:%s", n->params.serial); 6054 } else { 6055 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn); 6056 } 6057 } 6058 6059 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) 6060 { 6061 NvmeIdCtrl *id = &n->id_ctrl; 6062 uint8_t *pci_conf = pci_dev->config; 6063 6064 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); 6065 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); 6066 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' '); 6067 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' '); 6068 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' '); 6069 6070 id->cntlid = cpu_to_le16(n->cntlid); 6071 6072 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR); 6073 6074 id->rab = 6; 6075 6076 if (n->params.use_intel_id) { 6077 id->ieee[0] = 0xb3; 6078 id->ieee[1] = 0x02; 6079 id->ieee[2] = 0x00; 6080 } else { 6081 id->ieee[0] = 0x00; 6082 id->ieee[1] = 0x54; 6083 id->ieee[2] = 0x52; 6084 } 6085 6086 id->mdts = n->params.mdts; 6087 id->ver = cpu_to_le32(NVME_SPEC_VER); 6088 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT); 6089 id->cntrltype = 0x1; 6090 6091 /* 6092 * Because the controller always completes the Abort command immediately, 6093 * there can never be more than one concurrently executing Abort command, 6094 * so this value is never used for anything. Note that there can easily be 6095 * many Abort commands in the queues, but they are not considered 6096 * "executing" until processed by nvme_abort. 6097 * 6098 * The specification recommends a value of 3 for Abort Command Limit (four 6099 * concurrently outstanding Abort commands), so let's use that though it is 6100 * inconsequential.
6101 */ 6102 id->acl = 3; 6103 id->aerl = n->params.aerl; 6104 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; 6105 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED; 6106 6107 /* recommended default value (~70 C) */ 6108 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); 6109 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL); 6110 6111 id->sqes = (0x6 << 4) | 0x6; 6112 id->cqes = (0x4 << 4) | 0x4; 6113 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES); 6114 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | 6115 NVME_ONCS_FEATURES | NVME_ONCS_DSM | 6116 NVME_ONCS_COMPARE | NVME_ONCS_COPY); 6117 6118 /* 6119 * NOTE: If this device ever supports a command set that does NOT use 0x0 6120 * as a Flush-equivalent operation, support for the broadcast NSID in Flush 6121 * should probably be removed. 6122 * 6123 * See comment in nvme_io_cmd. 6124 */ 6125 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT; 6126 6127 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0); 6128 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | 6129 NVME_CTRL_SGLS_BITBUCKET); 6130 6131 nvme_init_subnqn(n); 6132 6133 id->psd[0].mp = cpu_to_le16(0x9c4); 6134 id->psd[0].enlat = cpu_to_le32(0x10); 6135 id->psd[0].exlat = cpu_to_le32(0x4); 6136 6137 if (n->subsys) { 6138 id->cmic |= NVME_CMIC_MULTI_CTRL; 6139 } 6140 6141 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); 6142 NVME_CAP_SET_CQR(n->bar.cap, 1); 6143 NVME_CAP_SET_TO(n->bar.cap, 0xf); 6144 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); 6145 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP); 6146 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); 6147 NVME_CAP_SET_MPSMAX(n->bar.cap, 4); 6148 NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); 6149 NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 
1 : 0); 6150 6151 n->bar.vs = NVME_SPEC_VER; 6152 n->bar.intmc = n->bar.intms = 0; 6153 } 6154 6155 static int nvme_init_subsys(NvmeCtrl *n, Error **errp) 6156 { 6157 int cntlid; 6158 6159 if (!n->subsys) { 6160 return 0; 6161 } 6162 6163 cntlid = nvme_subsys_register_ctrl(n, errp); 6164 if (cntlid < 0) { 6165 return -1; 6166 } 6167 6168 n->cntlid = cntlid; 6169 6170 return 0; 6171 } 6172 6173 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns) 6174 { 6175 uint32_t nsid = ns->params.nsid; 6176 assert(nsid && nsid <= NVME_MAX_NAMESPACES); 6177 6178 n->namespaces[nsid] = ns; 6179 ns->attached++; 6180 6181 n->dmrsl = MIN_NON_ZERO(n->dmrsl, 6182 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 6183 } 6184 6185 static void nvme_realize(PCIDevice *pci_dev, Error **errp) 6186 { 6187 NvmeCtrl *n = NVME(pci_dev); 6188 NvmeNamespace *ns; 6189 Error *local_err = NULL; 6190 6191 nvme_check_constraints(n, &local_err); 6192 if (local_err) { 6193 error_propagate(errp, local_err); 6194 return; 6195 } 6196 6197 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, 6198 &pci_dev->qdev, n->parent_obj.qdev.id); 6199 6200 nvme_init_state(n); 6201 if (nvme_init_pci(n, pci_dev, errp)) { 6202 return; 6203 } 6204 6205 if (nvme_init_subsys(n, errp)) { 6206 error_propagate(errp, local_err); 6207 return; 6208 } 6209 nvme_init_ctrl(n, pci_dev); 6210 6211 /* setup a namespace if the controller drive property was given */ 6212 if (n->namespace.blkconf.blk) { 6213 ns = &n->namespace; 6214 ns->params.nsid = 1; 6215 6216 if (nvme_ns_setup(n, ns, errp)) { 6217 return; 6218 } 6219 6220 nvme_attach_ns(n, ns); 6221 } 6222 } 6223 6224 static void nvme_exit(PCIDevice *pci_dev) 6225 { 6226 NvmeCtrl *n = NVME(pci_dev); 6227 NvmeNamespace *ns; 6228 int i; 6229 6230 nvme_ctrl_reset(n); 6231 6232 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 6233 ns = nvme_ns(n, i); 6234 if (!ns) { 6235 continue; 6236 } 6237 6238 nvme_ns_cleanup(ns); 6239 } 6240 6241 g_free(n->cq); 6242 g_free(n->sq); 6243 g_free(n->aer_reqs); 6244 6245 if (n->params.cmb_size_mb) { 6246 g_free(n->cmb.buf); 6247 } 6248 6249 if (n->pmr.dev) { 6250 host_memory_backend_set_mapped(n->pmr.dev, false); 6251 } 6252 msix_uninit(pci_dev, &n->bar0, &n->bar0); 6253 memory_region_del_subregion(&n->bar0, &n->iomem); 6254 } 6255 6256 static Property nvme_props[] = { 6257 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf), 6258 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND, 6259 HostMemoryBackend *), 6260 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS, 6261 NvmeSubsystem *), 6262 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), 6263 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0), 6264 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0), 6265 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64), 6266 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65), 6267 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3), 6268 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), 6269 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), 6270 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7), 6271 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), 6272 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), 6273 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), 6274 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl, 6275 params.auto_transition_zones, true), 6276 DEFINE_PROP_END_OF_LIST(), 6277 }; 6278 6279 static void 
nvme_get_smart_warning(Object *obj, Visitor *v, const char *name, 6280 void *opaque, Error **errp) 6281 { 6282 NvmeCtrl *n = NVME(obj); 6283 uint8_t value = n->smart_critical_warning; 6284 6285 visit_type_uint8(v, name, &value, errp); 6286 } 6287 6288 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, 6289 void *opaque, Error **errp) 6290 { 6291 NvmeCtrl *n = NVME(obj); 6292 uint8_t value, old_value, cap = 0, index, event; 6293 6294 if (!visit_type_uint8(v, name, &value, errp)) { 6295 return; 6296 } 6297 6298 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY 6299 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; 6300 if (NVME_CAP_PMRS(n->bar.cap)) { 6301 cap |= NVME_SMART_PMR_UNRELIABLE; 6302 } 6303 6304 if ((value & cap) != value) { 6305 error_setg(errp, "unsupported smart critical warning bits: 0x%x", 6306 value & ~cap); 6307 return; 6308 } 6309 6310 old_value = n->smart_critical_warning; 6311 n->smart_critical_warning = value; 6312 6313 /* only inject new bits of smart critical warning */ 6314 for (index = 0; index < NVME_SMART_WARN_MAX; index++) { 6315 event = 1 << index; 6316 if (value & ~old_value & event) 6317 nvme_smart_event(n, event); 6318 } 6319 } 6320 6321 static const VMStateDescription nvme_vmstate = { 6322 .name = "nvme", 6323 .unmigratable = 1, 6324 }; 6325 6326 static void nvme_class_init(ObjectClass *oc, void *data) 6327 { 6328 DeviceClass *dc = DEVICE_CLASS(oc); 6329 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc); 6330 6331 pc->realize = nvme_realize; 6332 pc->exit = nvme_exit; 6333 pc->class_id = PCI_CLASS_STORAGE_EXPRESS; 6334 pc->revision = 2; 6335 6336 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); 6337 dc->desc = "Non-Volatile Memory Express"; 6338 device_class_set_props(dc, nvme_props); 6339 dc->vmsd = &nvme_vmstate; 6340 } 6341 6342 static void nvme_instance_init(Object *obj) 6343 { 6344 NvmeCtrl *n = NVME(obj); 6345 6346 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex, 6347 "bootindex", "/namespace@1,0", 6348 DEVICE(obj)); 6349 6350 object_property_add(obj, "smart_critical_warning", "uint8", 6351 nvme_get_smart_warning, 6352 nvme_set_smart_warning, NULL, NULL); 6353 } 6354 6355 static const TypeInfo nvme_info = { 6356 .name = TYPE_NVME, 6357 .parent = TYPE_PCI_DEVICE, 6358 .instance_size = sizeof(NvmeCtrl), 6359 .instance_init = nvme_instance_init, 6360 .class_init = nvme_class_init, 6361 .interfaces = (InterfaceInfo[]) { 6362 { INTERFACE_PCIE_DEVICE }, 6363 { } 6364 }, 6365 }; 6366 6367 static const TypeInfo nvme_bus_info = { 6368 .name = TYPE_NVME_BUS, 6369 .parent = TYPE_BUS, 6370 .instance_size = sizeof(NvmeBus), 6371 }; 6372 6373 static void nvme_register_types(void) 6374 { 6375 type_register_static(&nvme_info); 6376 type_register_static(&nvme_bus_info); 6377 } 6378 6379 type_init(nvme_register_types) 6380