/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 *   https://nvmexpress.org/developers/nvme-specification/
 *
 *
 * Notes on coding style
 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in the comments
 * (i.e. 'h' suffix instead of '0x' prefix).
 *
 * Usage
 * -----
 * See docs/system/nvme.rst for extensive documentation.
 *
 * Add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
 *      -device nvme,serial=<serial>,id=<bus_name>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>, \
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
 *              mdts=<N[optional]>,vsl=<N[optional]>, \
 *              zoned.zasl=<N[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
 *              zoned=<true|false[optional]>, \
 *              subsys=<subsys_id>,detached=<true|false[optional]>
 *
 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
 * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
 * default, the device will use the "v1.4 CMB scheme"; use the `legacy-cmb`
 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
 *
 * PMR emulation can be enabled by pointing to a memory-backend-file.
 * For example:
 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
 *
 * The PMR will use BAR 4/5 exclusively.
 *
 * To place controller(s) and namespace(s) in a subsystem, provide an
 * nvme-subsys device as shown above.
 *
 * nvme subsystem device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `nqn`
 *   This parameter provides the `<nqn_id>` part of the string
 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
 *   subsystem, but this is not enforced by QEMU. If not specified, it will
 *   default to the value of the `id` parameter (`<subsys_id>`).
 *
 * nvme device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`
 *   Specifying this parameter attaches the controller to the subsystem and
 *   the SUBNQN field in the controller will report the NQN of the subsystem
 *   device. This also enables the multi-controller capability represented in
 *   the Identify Controller data structure in CMIC (Controller Multi-path I/O
 *   and Namespace Sharing Capabilities).
 *
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
 *
 * - `mdts`
 *   Indicates the maximum data transfer size for a command that transfers data
 *   between host-accessible memory and the controller.
 *   The value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7
 *   (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
 *   KiB).
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `shared`
 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 *   or implicitly by the most recently defined NvmeBus) is linked to an
 *   nvme-subsys device, the namespace will be attached to all controllers in
 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 *   private namespace and may only be attached to a single controller at a
 *   time.
 *
 * - `detached`
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
 *   namespace will be available in the subsystem but not attached to any
 *   controllers.
 *
 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
 * In this case, the following namespace properties are available to configure
 * zoned operation:
 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 *
 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 *         The value 0 (default) forces zone capacity to be the same as zone
 *         size. The value of this property may not exceed zone size.
 *
 *     zoned.descr_ext_size=<zone descriptor extension size, default: 0>
 *         This value needs to be specified in 64B units. If it is zero,
 *         namespace(s) will not support zone descriptor extensions.
 *
 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently active zones.
 *
 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently open zones.
 *
 *     zoned.cross_read=<enable RAZB, default: false>
 *         Setting this property to true enables Read Across Zone Boundaries.
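 *
 * For reference, a hypothetical invocation that combines the parameters
 * documented above to create a zoned namespace on a subsystem-attached
 * controller might look as follows. The ids, serial and sizes used here are
 * illustrative placeholders chosen for this example, not defaults:
 *
 *      -drive file=zns.img,if=none,id=zns0
 *      -device nvme-subsys,id=subsys0,nqn=subsys0
 *      -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *      -device nvme-ns,drive=zns0,bus=nvme0,nsid=1,zoned=true, \
 *              zoned.zone_size=64M,zoned.zone_capacity=62M, \
 *              zoned.max_open=16,zoned.max_active=32, \
 *              zoned.cross_read=true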
144 */ 145 146 #include "qemu/osdep.h" 147 #include "qemu/cutils.h" 148 #include "qemu/error-report.h" 149 #include "qemu/log.h" 150 #include "qemu/units.h" 151 #include "qapi/error.h" 152 #include "qapi/visitor.h" 153 #include "sysemu/sysemu.h" 154 #include "sysemu/block-backend.h" 155 #include "sysemu/hostmem.h" 156 #include "hw/pci/msix.h" 157 #include "migration/vmstate.h" 158 159 #include "nvme.h" 160 #include "trace.h" 161 162 #define NVME_MAX_IOQPAIRS 0xffff 163 #define NVME_DB_SIZE 4 164 #define NVME_SPEC_VER 0x00010400 165 #define NVME_CMB_BIR 2 166 #define NVME_PMR_BIR 4 167 #define NVME_TEMPERATURE 0x143 168 #define NVME_TEMPERATURE_WARNING 0x157 169 #define NVME_TEMPERATURE_CRITICAL 0x175 170 #define NVME_NUM_FW_SLOTS 1 171 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) 172 173 #define NVME_GUEST_ERR(trace, fmt, ...) \ 174 do { \ 175 (trace_##trace)(__VA_ARGS__); \ 176 qemu_log_mask(LOG_GUEST_ERROR, #trace \ 177 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \ 178 } while (0) 179 180 static const bool nvme_feature_support[NVME_FID_MAX] = { 181 [NVME_ARBITRATION] = true, 182 [NVME_POWER_MANAGEMENT] = true, 183 [NVME_TEMPERATURE_THRESHOLD] = true, 184 [NVME_ERROR_RECOVERY] = true, 185 [NVME_VOLATILE_WRITE_CACHE] = true, 186 [NVME_NUMBER_OF_QUEUES] = true, 187 [NVME_INTERRUPT_COALESCING] = true, 188 [NVME_INTERRUPT_VECTOR_CONF] = true, 189 [NVME_WRITE_ATOMICITY] = true, 190 [NVME_ASYNCHRONOUS_EVENT_CONF] = true, 191 [NVME_TIMESTAMP] = true, 192 [NVME_COMMAND_SET_PROFILE] = true, 193 }; 194 195 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { 196 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE, 197 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS, 198 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE, 199 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE, 200 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE, 201 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE, 202 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE, 203 }; 204 205 static const uint32_t nvme_cse_acs[256] = { 206 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP, 207 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP, 208 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, 209 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP, 210 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP, 211 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP, 212 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP, 213 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, 214 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, 215 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, 216 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, 217 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 218 }; 219 220 static const uint32_t nvme_cse_iocs_none[256]; 221 222 static const uint32_t nvme_cse_iocs_nvm[256] = { 223 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 224 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 225 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 226 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, 227 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 228 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, 229 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 230 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, 231 }; 232 233 static const uint32_t nvme_cse_iocs_zoned[256] = { 234 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 235 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 236 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 237 [NVME_CMD_READ] = 
NVME_CMD_EFF_CSUPP, 238 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 239 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, 240 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 241 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, 242 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 243 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 244 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, 245 }; 246 247 static void nvme_process_sq(void *opaque); 248 249 static uint16_t nvme_sqid(NvmeRequest *req) 250 { 251 return le16_to_cpu(req->sq->sqid); 252 } 253 254 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, 255 NvmeZoneState state) 256 { 257 if (QTAILQ_IN_USE(zone, entry)) { 258 switch (nvme_get_zone_state(zone)) { 259 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 260 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); 261 break; 262 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 263 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 264 break; 265 case NVME_ZONE_STATE_CLOSED: 266 QTAILQ_REMOVE(&ns->closed_zones, zone, entry); 267 break; 268 case NVME_ZONE_STATE_FULL: 269 QTAILQ_REMOVE(&ns->full_zones, zone, entry); 270 default: 271 ; 272 } 273 } 274 275 nvme_set_zone_state(zone, state); 276 277 switch (state) { 278 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 279 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry); 280 break; 281 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 282 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry); 283 break; 284 case NVME_ZONE_STATE_CLOSED: 285 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry); 286 break; 287 case NVME_ZONE_STATE_FULL: 288 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry); 289 case NVME_ZONE_STATE_READ_ONLY: 290 break; 291 default: 292 zone->d.za = 0; 293 } 294 } 295 296 /* 297 * Check if we can open a zone without exceeding open/active limits. 298 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). 299 */ 300 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) 301 { 302 if (ns->params.max_active_zones != 0 && 303 ns->nr_active_zones + act > ns->params.max_active_zones) { 304 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones); 305 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; 306 } 307 if (ns->params.max_open_zones != 0 && 308 ns->nr_open_zones + opn > ns->params.max_open_zones) { 309 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones); 310 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; 311 } 312 313 return NVME_SUCCESS; 314 } 315 316 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) 317 { 318 hwaddr hi, lo; 319 320 if (!n->cmb.cmse) { 321 return false; 322 } 323 324 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba; 325 hi = lo + int128_get64(n->cmb.mem.size); 326 327 return addr >= lo && addr < hi; 328 } 329 330 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) 331 { 332 hwaddr base = n->params.legacy_cmb ? 
n->cmb.mem.addr : n->cmb.cba; 333 return &n->cmb.buf[addr - base]; 334 } 335 336 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) 337 { 338 hwaddr hi; 339 340 if (!n->pmr.cmse) { 341 return false; 342 } 343 344 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size); 345 346 return addr >= n->pmr.cba && addr < hi; 347 } 348 349 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) 350 { 351 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); 352 } 353 354 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) 355 { 356 hwaddr hi = addr + size - 1; 357 if (hi < addr) { 358 return 1; 359 } 360 361 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { 362 memcpy(buf, nvme_addr_to_cmb(n, addr), size); 363 return 0; 364 } 365 366 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { 367 memcpy(buf, nvme_addr_to_pmr(n, addr), size); 368 return 0; 369 } 370 371 return pci_dma_read(&n->parent_obj, addr, buf, size); 372 } 373 374 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size) 375 { 376 hwaddr hi = addr + size - 1; 377 if (hi < addr) { 378 return 1; 379 } 380 381 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { 382 memcpy(nvme_addr_to_cmb(n, addr), buf, size); 383 return 0; 384 } 385 386 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { 387 memcpy(nvme_addr_to_pmr(n, addr), buf, size); 388 return 0; 389 } 390 391 return pci_dma_write(&n->parent_obj, addr, buf, size); 392 } 393 394 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid) 395 { 396 return nsid && 397 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES); 398 } 399 400 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid) 401 { 402 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1; 403 } 404 405 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid) 406 { 407 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 
0 : -1; 408 } 409 410 static void nvme_inc_cq_tail(NvmeCQueue *cq) 411 { 412 cq->tail++; 413 if (cq->tail >= cq->size) { 414 cq->tail = 0; 415 cq->phase = !cq->phase; 416 } 417 } 418 419 static void nvme_inc_sq_head(NvmeSQueue *sq) 420 { 421 sq->head = (sq->head + 1) % sq->size; 422 } 423 424 static uint8_t nvme_cq_full(NvmeCQueue *cq) 425 { 426 return (cq->tail + 1) % cq->size == cq->head; 427 } 428 429 static uint8_t nvme_sq_empty(NvmeSQueue *sq) 430 { 431 return sq->head == sq->tail; 432 } 433 434 static void nvme_irq_check(NvmeCtrl *n) 435 { 436 if (msix_enabled(&(n->parent_obj))) { 437 return; 438 } 439 if (~n->bar.intms & n->irq_status) { 440 pci_irq_assert(&n->parent_obj); 441 } else { 442 pci_irq_deassert(&n->parent_obj); 443 } 444 } 445 446 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) 447 { 448 if (cq->irq_enabled) { 449 if (msix_enabled(&(n->parent_obj))) { 450 trace_pci_nvme_irq_msix(cq->vector); 451 msix_notify(&(n->parent_obj), cq->vector); 452 } else { 453 trace_pci_nvme_irq_pin(); 454 assert(cq->vector < 32); 455 n->irq_status |= 1 << cq->vector; 456 nvme_irq_check(n); 457 } 458 } else { 459 trace_pci_nvme_irq_masked(); 460 } 461 } 462 463 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) 464 { 465 if (cq->irq_enabled) { 466 if (msix_enabled(&(n->parent_obj))) { 467 return; 468 } else { 469 assert(cq->vector < 32); 470 n->irq_status &= ~(1 << cq->vector); 471 nvme_irq_check(n); 472 } 473 } 474 } 475 476 static void nvme_req_clear(NvmeRequest *req) 477 { 478 req->ns = NULL; 479 req->opaque = NULL; 480 req->aiocb = NULL; 481 memset(&req->cqe, 0x0, sizeof(req->cqe)); 482 req->status = NVME_SUCCESS; 483 } 484 485 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma) 486 { 487 if (dma) { 488 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0); 489 sg->flags = NVME_SG_DMA; 490 } else { 491 qemu_iovec_init(&sg->iov, 0); 492 } 493 494 sg->flags |= NVME_SG_ALLOC; 495 } 496 497 static inline void nvme_sg_unmap(NvmeSg *sg) 498 { 499 if (!(sg->flags & NVME_SG_ALLOC)) { 500 return; 501 } 502 503 if (sg->flags & NVME_SG_DMA) { 504 qemu_sglist_destroy(&sg->qsg); 505 } else { 506 qemu_iovec_destroy(&sg->iov); 507 } 508 509 memset(sg, 0x0, sizeof(*sg)); 510 } 511 512 /* 513 * When metadata is transfered as extended LBAs, the DPTR mapped into `sg` 514 * holds both data and metadata. This function splits the data and metadata 515 * into two separate QSG/IOVs. 516 */ 517 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data, 518 NvmeSg *mdata) 519 { 520 NvmeSg *dst = data; 521 uint32_t trans_len, count = ns->lbasz; 522 uint64_t offset = 0; 523 bool dma = sg->flags & NVME_SG_DMA; 524 size_t sge_len; 525 size_t sg_len = dma ? sg->qsg.size : sg->iov.size; 526 int sg_idx = 0; 527 528 assert(sg->flags & NVME_SG_ALLOC); 529 530 while (sg_len) { 531 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len; 532 533 trans_len = MIN(sg_len, count); 534 trans_len = MIN(trans_len, sge_len - offset); 535 536 if (dst) { 537 if (dma) { 538 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset, 539 trans_len); 540 } else { 541 qemu_iovec_add(&dst->iov, 542 sg->iov.iov[sg_idx].iov_base + offset, 543 trans_len); 544 } 545 } 546 547 sg_len -= trans_len; 548 count -= trans_len; 549 offset += trans_len; 550 551 if (count == 0) { 552 dst = (dst == data) ? mdata : data; 553 count = (dst == data) ? 
ns->lbasz : ns->lbaf.ms; 554 } 555 556 if (sge_len == offset) { 557 offset = 0; 558 sg_idx++; 559 } 560 } 561 } 562 563 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, 564 size_t len) 565 { 566 if (!len) { 567 return NVME_SUCCESS; 568 } 569 570 trace_pci_nvme_map_addr_cmb(addr, len); 571 572 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) { 573 return NVME_DATA_TRAS_ERROR; 574 } 575 576 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len); 577 578 return NVME_SUCCESS; 579 } 580 581 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, 582 size_t len) 583 { 584 if (!len) { 585 return NVME_SUCCESS; 586 } 587 588 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) { 589 return NVME_DATA_TRAS_ERROR; 590 } 591 592 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len); 593 594 return NVME_SUCCESS; 595 } 596 597 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len) 598 { 599 bool cmb = false, pmr = false; 600 601 if (!len) { 602 return NVME_SUCCESS; 603 } 604 605 trace_pci_nvme_map_addr(addr, len); 606 607 if (nvme_addr_is_cmb(n, addr)) { 608 cmb = true; 609 } else if (nvme_addr_is_pmr(n, addr)) { 610 pmr = true; 611 } 612 613 if (cmb || pmr) { 614 if (sg->flags & NVME_SG_DMA) { 615 return NVME_INVALID_USE_OF_CMB | NVME_DNR; 616 } 617 618 if (cmb) { 619 return nvme_map_addr_cmb(n, &sg->iov, addr, len); 620 } else { 621 return nvme_map_addr_pmr(n, &sg->iov, addr, len); 622 } 623 } 624 625 if (!(sg->flags & NVME_SG_DMA)) { 626 return NVME_INVALID_USE_OF_CMB | NVME_DNR; 627 } 628 629 qemu_sglist_add(&sg->qsg, addr, len); 630 631 return NVME_SUCCESS; 632 } 633 634 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr) 635 { 636 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr)); 637 } 638 639 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, 640 uint64_t prp2, uint32_t len) 641 { 642 hwaddr trans_len = n->page_size - (prp1 % n->page_size); 643 trans_len = MIN(len, trans_len); 644 int num_prps = (len >> n->page_bits) + 1; 645 uint16_t status; 646 int ret; 647 648 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); 649 650 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1)); 651 652 status = nvme_map_addr(n, sg, prp1, trans_len); 653 if (status) { 654 goto unmap; 655 } 656 657 len -= trans_len; 658 if (len) { 659 if (len > n->page_size) { 660 uint64_t prp_list[n->max_prp_ents]; 661 uint32_t nents, prp_trans; 662 int i = 0; 663 664 /* 665 * The first PRP list entry, pointed to by PRP2 may contain offset. 666 * Hence, we need to calculate the number of entries in based on 667 * that offset. 
668 */ 669 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3; 670 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); 671 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); 672 if (ret) { 673 trace_pci_nvme_err_addr_read(prp2); 674 status = NVME_DATA_TRAS_ERROR; 675 goto unmap; 676 } 677 while (len != 0) { 678 uint64_t prp_ent = le64_to_cpu(prp_list[i]); 679 680 if (i == nents - 1 && len > n->page_size) { 681 if (unlikely(prp_ent & (n->page_size - 1))) { 682 trace_pci_nvme_err_invalid_prplist_ent(prp_ent); 683 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 684 goto unmap; 685 } 686 687 i = 0; 688 nents = (len + n->page_size - 1) >> n->page_bits; 689 nents = MIN(nents, n->max_prp_ents); 690 prp_trans = nents * sizeof(uint64_t); 691 ret = nvme_addr_read(n, prp_ent, (void *)prp_list, 692 prp_trans); 693 if (ret) { 694 trace_pci_nvme_err_addr_read(prp_ent); 695 status = NVME_DATA_TRAS_ERROR; 696 goto unmap; 697 } 698 prp_ent = le64_to_cpu(prp_list[i]); 699 } 700 701 if (unlikely(prp_ent & (n->page_size - 1))) { 702 trace_pci_nvme_err_invalid_prplist_ent(prp_ent); 703 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 704 goto unmap; 705 } 706 707 trans_len = MIN(len, n->page_size); 708 status = nvme_map_addr(n, sg, prp_ent, trans_len); 709 if (status) { 710 goto unmap; 711 } 712 713 len -= trans_len; 714 i++; 715 } 716 } else { 717 if (unlikely(prp2 & (n->page_size - 1))) { 718 trace_pci_nvme_err_invalid_prp2_align(prp2); 719 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 720 goto unmap; 721 } 722 status = nvme_map_addr(n, sg, prp2, len); 723 if (status) { 724 goto unmap; 725 } 726 } 727 } 728 729 return NVME_SUCCESS; 730 731 unmap: 732 nvme_sg_unmap(sg); 733 return status; 734 } 735 736 /* 737 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the 738 * number of bytes mapped in len. 739 */ 740 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, 741 NvmeSglDescriptor *segment, uint64_t nsgld, 742 size_t *len, NvmeCmd *cmd) 743 { 744 dma_addr_t addr, trans_len; 745 uint32_t dlen; 746 uint16_t status; 747 748 for (int i = 0; i < nsgld; i++) { 749 uint8_t type = NVME_SGL_TYPE(segment[i].type); 750 751 switch (type) { 752 case NVME_SGL_DESCR_TYPE_BIT_BUCKET: 753 if (cmd->opcode == NVME_CMD_WRITE) { 754 continue; 755 } 756 case NVME_SGL_DESCR_TYPE_DATA_BLOCK: 757 break; 758 case NVME_SGL_DESCR_TYPE_SEGMENT: 759 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: 760 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR; 761 default: 762 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR; 763 } 764 765 dlen = le32_to_cpu(segment[i].len); 766 767 if (!dlen) { 768 continue; 769 } 770 771 if (*len == 0) { 772 /* 773 * All data has been mapped, but the SGL contains additional 774 * segments and/or descriptors. The controller might accept 775 * ignoring the rest of the SGL. 
776 */ 777 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls); 778 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) { 779 break; 780 } 781 782 trace_pci_nvme_err_invalid_sgl_excess_length(dlen); 783 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 784 } 785 786 trans_len = MIN(*len, dlen); 787 788 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) { 789 goto next; 790 } 791 792 addr = le64_to_cpu(segment[i].addr); 793 794 if (UINT64_MAX - addr < dlen) { 795 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 796 } 797 798 status = nvme_map_addr(n, sg, addr, trans_len); 799 if (status) { 800 return status; 801 } 802 803 next: 804 *len -= trans_len; 805 } 806 807 return NVME_SUCCESS; 808 } 809 810 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, 811 size_t len, NvmeCmd *cmd) 812 { 813 /* 814 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid 815 * dynamically allocating a potentially huge SGL. The spec allows the SGL 816 * to be larger (as in number of bytes required to describe the SGL 817 * descriptors and segment chain) than the command transfer size, so it is 818 * not bounded by MDTS. 819 */ 820 const int SEG_CHUNK_SIZE = 256; 821 822 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; 823 uint64_t nsgld; 824 uint32_t seg_len; 825 uint16_t status; 826 hwaddr addr; 827 int ret; 828 829 sgld = &sgl; 830 addr = le64_to_cpu(sgl.addr); 831 832 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len); 833 834 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr)); 835 836 /* 837 * If the entire transfer can be described with a single data block it can 838 * be mapped directly. 839 */ 840 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { 841 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd); 842 if (status) { 843 goto unmap; 844 } 845 846 goto out; 847 } 848 849 for (;;) { 850 switch (NVME_SGL_TYPE(sgld->type)) { 851 case NVME_SGL_DESCR_TYPE_SEGMENT: 852 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: 853 break; 854 default: 855 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 856 } 857 858 seg_len = le32_to_cpu(sgld->len); 859 860 /* check the length of the (Last) Segment descriptor */ 861 if ((!seg_len || seg_len & 0xf) && 862 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) { 863 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 864 } 865 866 if (UINT64_MAX - addr < seg_len) { 867 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 868 } 869 870 nsgld = seg_len / sizeof(NvmeSglDescriptor); 871 872 while (nsgld > SEG_CHUNK_SIZE) { 873 if (nvme_addr_read(n, addr, segment, sizeof(segment))) { 874 trace_pci_nvme_err_addr_read(addr); 875 status = NVME_DATA_TRAS_ERROR; 876 goto unmap; 877 } 878 879 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE, 880 &len, cmd); 881 if (status) { 882 goto unmap; 883 } 884 885 nsgld -= SEG_CHUNK_SIZE; 886 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor); 887 } 888 889 ret = nvme_addr_read(n, addr, segment, nsgld * 890 sizeof(NvmeSglDescriptor)); 891 if (ret) { 892 trace_pci_nvme_err_addr_read(addr); 893 status = NVME_DATA_TRAS_ERROR; 894 goto unmap; 895 } 896 897 last_sgld = &segment[nsgld - 1]; 898 899 /* 900 * If the segment ends with a Data Block or Bit Bucket Descriptor Type, 901 * then we are done. 
902 */ 903 switch (NVME_SGL_TYPE(last_sgld->type)) { 904 case NVME_SGL_DESCR_TYPE_DATA_BLOCK: 905 case NVME_SGL_DESCR_TYPE_BIT_BUCKET: 906 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd); 907 if (status) { 908 goto unmap; 909 } 910 911 goto out; 912 913 default: 914 break; 915 } 916 917 /* 918 * If the last descriptor was not a Data Block or Bit Bucket, then the 919 * current segment must not be a Last Segment. 920 */ 921 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) { 922 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 923 goto unmap; 924 } 925 926 sgld = last_sgld; 927 addr = le64_to_cpu(sgld->addr); 928 929 /* 930 * Do not map the last descriptor; it will be a Segment or Last Segment 931 * descriptor and is handled by the next iteration. 932 */ 933 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd); 934 if (status) { 935 goto unmap; 936 } 937 } 938 939 out: 940 /* if there is any residual left in len, the SGL was too short */ 941 if (len) { 942 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 943 goto unmap; 944 } 945 946 return NVME_SUCCESS; 947 948 unmap: 949 nvme_sg_unmap(sg); 950 return status; 951 } 952 953 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, 954 NvmeCmd *cmd) 955 { 956 uint64_t prp1, prp2; 957 958 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) { 959 case NVME_PSDT_PRP: 960 prp1 = le64_to_cpu(cmd->dptr.prp1); 961 prp2 = le64_to_cpu(cmd->dptr.prp2); 962 963 return nvme_map_prp(n, sg, prp1, prp2, len); 964 case NVME_PSDT_SGL_MPTR_CONTIGUOUS: 965 case NVME_PSDT_SGL_MPTR_SGL: 966 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd); 967 default: 968 return NVME_INVALID_FIELD; 969 } 970 } 971 972 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len, 973 NvmeCmd *cmd) 974 { 975 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags); 976 hwaddr mptr = le64_to_cpu(cmd->mptr); 977 uint16_t status; 978 979 if (psdt == NVME_PSDT_SGL_MPTR_SGL) { 980 NvmeSglDescriptor sgl; 981 982 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) { 983 return NVME_DATA_TRAS_ERROR; 984 } 985 986 status = nvme_map_sgl(n, sg, sgl, len, cmd); 987 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) { 988 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR; 989 } 990 991 return status; 992 } 993 994 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr)); 995 status = nvme_map_addr(n, sg, mptr, len); 996 if (status) { 997 nvme_sg_unmap(sg); 998 } 999 1000 return status; 1001 } 1002 1003 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) 1004 { 1005 NvmeNamespace *ns = req->ns; 1006 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1007 uint16_t ctrl = le16_to_cpu(rw->control); 1008 size_t len = nvme_l2b(ns, nlb); 1009 uint16_t status; 1010 1011 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && 1012 (ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) { 1013 goto out; 1014 } 1015 1016 if (nvme_ns_ext(ns)) { 1017 NvmeSg sg; 1018 1019 len += nvme_m2b(ns, nlb); 1020 1021 status = nvme_map_dptr(n, &sg, len, &req->cmd); 1022 if (status) { 1023 return status; 1024 } 1025 1026 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA); 1027 nvme_sg_split(&sg, ns, &req->sg, NULL); 1028 nvme_sg_unmap(&sg); 1029 1030 return NVME_SUCCESS; 1031 } 1032 1033 out: 1034 return nvme_map_dptr(n, &req->sg, len, &req->cmd); 1035 } 1036 1037 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) 1038 { 1039 NvmeNamespace *ns = req->ns; 1040 size_t len = nvme_m2b(ns, nlb); 1041 uint16_t status; 1042 1043 if (nvme_ns_ext(ns)) { 1044 NvmeSg sg; 1045 1046 len += 
nvme_l2b(ns, nlb); 1047 1048 status = nvme_map_dptr(n, &sg, len, &req->cmd); 1049 if (status) { 1050 return status; 1051 } 1052 1053 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA); 1054 nvme_sg_split(&sg, ns, NULL, &req->sg); 1055 nvme_sg_unmap(&sg); 1056 1057 return NVME_SUCCESS; 1058 } 1059 1060 return nvme_map_mptr(n, &req->sg, len, &req->cmd); 1061 } 1062 1063 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, 1064 uint32_t len, uint32_t bytes, 1065 int32_t skip_bytes, int64_t offset, 1066 NvmeTxDirection dir) 1067 { 1068 hwaddr addr; 1069 uint32_t trans_len, count = bytes; 1070 bool dma = sg->flags & NVME_SG_DMA; 1071 int64_t sge_len; 1072 int sg_idx = 0; 1073 int ret; 1074 1075 assert(sg->flags & NVME_SG_ALLOC); 1076 1077 while (len) { 1078 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len; 1079 1080 if (sge_len - offset < 0) { 1081 offset -= sge_len; 1082 sg_idx++; 1083 continue; 1084 } 1085 1086 if (sge_len == offset) { 1087 offset = 0; 1088 sg_idx++; 1089 continue; 1090 } 1091 1092 trans_len = MIN(len, count); 1093 trans_len = MIN(trans_len, sge_len - offset); 1094 1095 if (dma) { 1096 addr = sg->qsg.sg[sg_idx].base + offset; 1097 } else { 1098 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset; 1099 } 1100 1101 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1102 ret = nvme_addr_read(n, addr, ptr, trans_len); 1103 } else { 1104 ret = nvme_addr_write(n, addr, ptr, trans_len); 1105 } 1106 1107 if (ret) { 1108 return NVME_DATA_TRAS_ERROR; 1109 } 1110 1111 ptr += trans_len; 1112 len -= trans_len; 1113 count -= trans_len; 1114 offset += trans_len; 1115 1116 if (count == 0) { 1117 count = bytes; 1118 offset += skip_bytes; 1119 } 1120 } 1121 1122 return NVME_SUCCESS; 1123 } 1124 1125 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, 1126 NvmeTxDirection dir) 1127 { 1128 assert(sg->flags & NVME_SG_ALLOC); 1129 1130 if (sg->flags & NVME_SG_DMA) { 1131 uint64_t residual; 1132 1133 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1134 residual = dma_buf_write(ptr, len, &sg->qsg); 1135 } else { 1136 residual = dma_buf_read(ptr, len, &sg->qsg); 1137 } 1138 1139 if (unlikely(residual)) { 1140 trace_pci_nvme_err_invalid_dma(); 1141 return NVME_INVALID_FIELD | NVME_DNR; 1142 } 1143 } else { 1144 size_t bytes; 1145 1146 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1147 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len); 1148 } else { 1149 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len); 1150 } 1151 1152 if (unlikely(bytes != len)) { 1153 trace_pci_nvme_err_invalid_dma(); 1154 return NVME_INVALID_FIELD | NVME_DNR; 1155 } 1156 } 1157 1158 return NVME_SUCCESS; 1159 } 1160 1161 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1162 NvmeRequest *req) 1163 { 1164 uint16_t status; 1165 1166 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 1167 if (status) { 1168 return status; 1169 } 1170 1171 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE); 1172 } 1173 1174 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1175 NvmeRequest *req) 1176 { 1177 uint16_t status; 1178 1179 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 1180 if (status) { 1181 return status; 1182 } 1183 1184 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE); 1185 } 1186 1187 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1188 NvmeTxDirection dir, NvmeRequest *req) 1189 { 1190 NvmeNamespace *ns = req->ns; 1191 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1192 
uint16_t ctrl = le16_to_cpu(rw->control); 1193 1194 if (nvme_ns_ext(ns) && 1195 !(ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) { 1196 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz, 1197 ns->lbaf.ms, 0, dir); 1198 } 1199 1200 return nvme_tx(n, &req->sg, ptr, len, dir); 1201 } 1202 1203 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1204 NvmeTxDirection dir, NvmeRequest *req) 1205 { 1206 NvmeNamespace *ns = req->ns; 1207 uint16_t status; 1208 1209 if (nvme_ns_ext(ns)) { 1210 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms, 1211 ns->lbasz, ns->lbasz, dir); 1212 } 1213 1214 nvme_sg_unmap(&req->sg); 1215 1216 status = nvme_map_mptr(n, &req->sg, len, &req->cmd); 1217 if (status) { 1218 return status; 1219 } 1220 1221 return nvme_tx(n, &req->sg, ptr, len, dir); 1222 } 1223 1224 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, 1225 BlockCompletionFunc *cb, NvmeRequest *req) 1226 { 1227 assert(req->sg.flags & NVME_SG_ALLOC); 1228 1229 if (req->sg.flags & NVME_SG_DMA) { 1230 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, 1231 cb, req); 1232 } else { 1233 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req); 1234 } 1235 } 1236 1237 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, 1238 BlockCompletionFunc *cb, NvmeRequest *req) 1239 { 1240 assert(req->sg.flags & NVME_SG_ALLOC); 1241 1242 if (req->sg.flags & NVME_SG_DMA) { 1243 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, 1244 cb, req); 1245 } else { 1246 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req); 1247 } 1248 } 1249 1250 static void nvme_post_cqes(void *opaque) 1251 { 1252 NvmeCQueue *cq = opaque; 1253 NvmeCtrl *n = cq->ctrl; 1254 NvmeRequest *req, *next; 1255 int ret; 1256 1257 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) { 1258 NvmeSQueue *sq; 1259 hwaddr addr; 1260 1261 if (nvme_cq_full(cq)) { 1262 break; 1263 } 1264 1265 sq = req->sq; 1266 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase); 1267 req->cqe.sq_id = cpu_to_le16(sq->sqid); 1268 req->cqe.sq_head = cpu_to_le16(sq->head); 1269 addr = cq->dma_addr + cq->tail * n->cqe_size; 1270 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe, 1271 sizeof(req->cqe)); 1272 if (ret) { 1273 trace_pci_nvme_err_addr_write(addr); 1274 trace_pci_nvme_err_cfs(); 1275 n->bar.csts = NVME_CSTS_FAILED; 1276 break; 1277 } 1278 QTAILQ_REMOVE(&cq->req_list, req, entry); 1279 nvme_inc_cq_tail(cq); 1280 nvme_sg_unmap(&req->sg); 1281 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); 1282 } 1283 if (cq->tail != cq->head) { 1284 nvme_irq_assert(n, cq); 1285 } 1286 } 1287 1288 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) 1289 { 1290 assert(cq->cqid == req->sq->cqid); 1291 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid, 1292 req->status); 1293 1294 if (req->status) { 1295 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns), 1296 req->status, req->cmd.opcode); 1297 } 1298 1299 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); 1300 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); 1301 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 1302 } 1303 1304 static void nvme_process_aers(void *opaque) 1305 { 1306 NvmeCtrl *n = opaque; 1307 NvmeAsyncEvent *event, *next; 1308 1309 trace_pci_nvme_process_aers(n->aer_queued); 1310 1311 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) { 1312 NvmeRequest *req; 1313 NvmeAerResult *result; 1314 1315 /* can't post 
cqe if there is nothing to complete */ 1316 if (!n->outstanding_aers) { 1317 trace_pci_nvme_no_outstanding_aers(); 1318 break; 1319 } 1320 1321 /* ignore if masked (cqe posted, but event not cleared) */ 1322 if (n->aer_mask & (1 << event->result.event_type)) { 1323 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask); 1324 continue; 1325 } 1326 1327 QTAILQ_REMOVE(&n->aer_queue, event, entry); 1328 n->aer_queued--; 1329 1330 n->aer_mask |= 1 << event->result.event_type; 1331 n->outstanding_aers--; 1332 1333 req = n->aer_reqs[n->outstanding_aers]; 1334 1335 result = (NvmeAerResult *) &req->cqe.result; 1336 result->event_type = event->result.event_type; 1337 result->event_info = event->result.event_info; 1338 result->log_page = event->result.log_page; 1339 g_free(event); 1340 1341 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info, 1342 result->log_page); 1343 1344 nvme_enqueue_req_completion(&n->admin_cq, req); 1345 } 1346 } 1347 1348 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type, 1349 uint8_t event_info, uint8_t log_page) 1350 { 1351 NvmeAsyncEvent *event; 1352 1353 trace_pci_nvme_enqueue_event(event_type, event_info, log_page); 1354 1355 if (n->aer_queued == n->params.aer_max_queued) { 1356 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued); 1357 return; 1358 } 1359 1360 event = g_new(NvmeAsyncEvent, 1); 1361 event->result = (NvmeAerResult) { 1362 .event_type = event_type, 1363 .event_info = event_info, 1364 .log_page = log_page, 1365 }; 1366 1367 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry); 1368 n->aer_queued++; 1369 1370 nvme_process_aers(n); 1371 } 1372 1373 static void nvme_smart_event(NvmeCtrl *n, uint8_t event) 1374 { 1375 uint8_t aer_info; 1376 1377 /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */ 1378 if (!(NVME_AEC_SMART(n->features.async_config) & event)) { 1379 return; 1380 } 1381 1382 switch (event) { 1383 case NVME_SMART_SPARE: 1384 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH; 1385 break; 1386 case NVME_SMART_TEMPERATURE: 1387 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH; 1388 break; 1389 case NVME_SMART_RELIABILITY: 1390 case NVME_SMART_MEDIA_READ_ONLY: 1391 case NVME_SMART_FAILED_VOLATILE_MEDIA: 1392 case NVME_SMART_PMR_UNRELIABLE: 1393 aer_info = NVME_AER_INFO_SMART_RELIABILITY; 1394 break; 1395 default: 1396 return; 1397 } 1398 1399 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO); 1400 } 1401 1402 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) 1403 { 1404 n->aer_mask &= ~(1 << event_type); 1405 if (!QTAILQ_EMPTY(&n->aer_queue)) { 1406 nvme_process_aers(n); 1407 } 1408 } 1409 1410 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) 1411 { 1412 uint8_t mdts = n->params.mdts; 1413 1414 if (mdts && len > n->page_size << mdts) { 1415 trace_pci_nvme_err_mdts(len); 1416 return NVME_INVALID_FIELD | NVME_DNR; 1417 } 1418 1419 return NVME_SUCCESS; 1420 } 1421 1422 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, 1423 uint32_t nlb) 1424 { 1425 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze); 1426 1427 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) { 1428 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze); 1429 return NVME_LBA_RANGE | NVME_DNR; 1430 } 1431 1432 return NVME_SUCCESS; 1433 } 1434 1435 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, 1436 uint32_t nlb) 1437 { 1438 BlockDriverState *bs = blk_bs(ns->blkconf.blk); 1439 1440 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb); 1441 int64_t offset = 
nvme_l2b(ns, slba); 1442 bool zeroed; 1443 int ret; 1444 1445 Error *local_err = NULL; 1446 1447 /* 1448 * `pnum` holds the number of bytes after offset that shares the same 1449 * allocation status as the byte at offset. If `pnum` is different from 1450 * `bytes`, we should check the allocation status of the next range and 1451 * continue this until all bytes have been checked. 1452 */ 1453 do { 1454 bytes -= pnum; 1455 1456 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL); 1457 if (ret < 0) { 1458 error_setg_errno(&local_err, -ret, "unable to get block status"); 1459 error_report_err(local_err); 1460 1461 return NVME_INTERNAL_DEV_ERROR; 1462 } 1463 1464 zeroed = !!(ret & BDRV_BLOCK_ZERO); 1465 1466 trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed); 1467 1468 if (zeroed) { 1469 return NVME_DULB; 1470 } 1471 1472 offset += pnum; 1473 } while (pnum != bytes); 1474 1475 return NVME_SUCCESS; 1476 } 1477 1478 static void nvme_aio_err(NvmeRequest *req, int ret) 1479 { 1480 uint16_t status = NVME_SUCCESS; 1481 Error *local_err = NULL; 1482 1483 switch (req->cmd.opcode) { 1484 case NVME_CMD_READ: 1485 status = NVME_UNRECOVERED_READ; 1486 break; 1487 case NVME_CMD_FLUSH: 1488 case NVME_CMD_WRITE: 1489 case NVME_CMD_WRITE_ZEROES: 1490 case NVME_CMD_ZONE_APPEND: 1491 status = NVME_WRITE_FAULT; 1492 break; 1493 default: 1494 status = NVME_INTERNAL_DEV_ERROR; 1495 break; 1496 } 1497 1498 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); 1499 1500 error_setg_errno(&local_err, -ret, "aio failed"); 1501 error_report_err(local_err); 1502 1503 /* 1504 * Set the command status code to the first encountered error but allow a 1505 * subsequent Internal Device Error to trump it. 1506 */ 1507 if (req->status && status != NVME_INTERNAL_DEV_ERROR) { 1508 return; 1509 } 1510 1511 req->status = status; 1512 } 1513 1514 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba) 1515 { 1516 return ns->zone_size_log2 > 0 ? 
slba >> ns->zone_size_log2 : 1517 slba / ns->zone_size; 1518 } 1519 1520 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) 1521 { 1522 uint32_t zone_idx = nvme_zone_idx(ns, slba); 1523 1524 assert(zone_idx < ns->num_zones); 1525 return &ns->zone_array[zone_idx]; 1526 } 1527 1528 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) 1529 { 1530 uint64_t zslba = zone->d.zslba; 1531 1532 switch (nvme_get_zone_state(zone)) { 1533 case NVME_ZONE_STATE_EMPTY: 1534 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1535 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1536 case NVME_ZONE_STATE_CLOSED: 1537 return NVME_SUCCESS; 1538 case NVME_ZONE_STATE_FULL: 1539 trace_pci_nvme_err_zone_is_full(zslba); 1540 return NVME_ZONE_FULL; 1541 case NVME_ZONE_STATE_OFFLINE: 1542 trace_pci_nvme_err_zone_is_offline(zslba); 1543 return NVME_ZONE_OFFLINE; 1544 case NVME_ZONE_STATE_READ_ONLY: 1545 trace_pci_nvme_err_zone_is_read_only(zslba); 1546 return NVME_ZONE_READ_ONLY; 1547 default: 1548 assert(false); 1549 } 1550 1551 return NVME_INTERNAL_DEV_ERROR; 1552 } 1553 1554 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, 1555 uint64_t slba, uint32_t nlb) 1556 { 1557 uint64_t zcap = nvme_zone_wr_boundary(zone); 1558 uint16_t status; 1559 1560 status = nvme_check_zone_state_for_write(zone); 1561 if (status) { 1562 return status; 1563 } 1564 1565 if (unlikely(slba != zone->w_ptr)) { 1566 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); 1567 return NVME_ZONE_INVALID_WRITE; 1568 } 1569 1570 if (unlikely((slba + nlb) > zcap)) { 1571 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap); 1572 return NVME_ZONE_BOUNDARY_ERROR; 1573 } 1574 1575 return NVME_SUCCESS; 1576 } 1577 1578 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) 1579 { 1580 switch (nvme_get_zone_state(zone)) { 1581 case NVME_ZONE_STATE_EMPTY: 1582 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1583 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1584 case NVME_ZONE_STATE_FULL: 1585 case NVME_ZONE_STATE_CLOSED: 1586 case NVME_ZONE_STATE_READ_ONLY: 1587 return NVME_SUCCESS; 1588 case NVME_ZONE_STATE_OFFLINE: 1589 trace_pci_nvme_err_zone_is_offline(zone->d.zslba); 1590 return NVME_ZONE_OFFLINE; 1591 default: 1592 assert(false); 1593 } 1594 1595 return NVME_INTERNAL_DEV_ERROR; 1596 } 1597 1598 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, 1599 uint32_t nlb) 1600 { 1601 NvmeZone *zone = nvme_get_zone_by_slba(ns, slba); 1602 uint64_t bndry = nvme_zone_rd_boundary(ns, zone); 1603 uint64_t end = slba + nlb; 1604 uint16_t status; 1605 1606 status = nvme_check_zone_state_for_read(zone); 1607 if (status) { 1608 ; 1609 } else if (unlikely(end > bndry)) { 1610 if (!ns->params.cross_zone_read) { 1611 status = NVME_ZONE_BOUNDARY_ERROR; 1612 } else { 1613 /* 1614 * Read across zone boundary - check that all subsequent 1615 * zones that are being read have an appropriate state. 
1616 */ 1617 do { 1618 zone++; 1619 status = nvme_check_zone_state_for_read(zone); 1620 if (status) { 1621 break; 1622 } 1623 } while (end > nvme_zone_rd_boundary(ns, zone)); 1624 } 1625 } 1626 1627 return status; 1628 } 1629 1630 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) 1631 { 1632 switch (nvme_get_zone_state(zone)) { 1633 case NVME_ZONE_STATE_FULL: 1634 return NVME_SUCCESS; 1635 1636 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1637 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1638 nvme_aor_dec_open(ns); 1639 /* fallthrough */ 1640 case NVME_ZONE_STATE_CLOSED: 1641 nvme_aor_dec_active(ns); 1642 /* fallthrough */ 1643 case NVME_ZONE_STATE_EMPTY: 1644 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); 1645 return NVME_SUCCESS; 1646 1647 default: 1648 return NVME_ZONE_INVAL_TRANSITION; 1649 } 1650 } 1651 1652 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) 1653 { 1654 switch (nvme_get_zone_state(zone)) { 1655 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1656 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1657 nvme_aor_dec_open(ns); 1658 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 1659 /* fall through */ 1660 case NVME_ZONE_STATE_CLOSED: 1661 return NVME_SUCCESS; 1662 1663 default: 1664 return NVME_ZONE_INVAL_TRANSITION; 1665 } 1666 } 1667 1668 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) 1669 { 1670 NvmeZone *zone; 1671 1672 if (ns->params.max_open_zones && 1673 ns->nr_open_zones == ns->params.max_open_zones) { 1674 zone = QTAILQ_FIRST(&ns->imp_open_zones); 1675 if (zone) { 1676 /* 1677 * Automatically close this implicitly open zone. 1678 */ 1679 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 1680 nvme_zrm_close(ns, zone); 1681 } 1682 } 1683 } 1684 1685 enum { 1686 NVME_ZRM_AUTO = 1 << 0, 1687 }; 1688 1689 static uint16_t nvme_zrm_open_flags(NvmeNamespace *ns, NvmeZone *zone, 1690 int flags) 1691 { 1692 int act = 0; 1693 uint16_t status; 1694 1695 switch (nvme_get_zone_state(zone)) { 1696 case NVME_ZONE_STATE_EMPTY: 1697 act = 1; 1698 1699 /* fallthrough */ 1700 1701 case NVME_ZONE_STATE_CLOSED: 1702 nvme_zrm_auto_transition_zone(ns); 1703 status = nvme_aor_check(ns, act, 1); 1704 if (status) { 1705 return status; 1706 } 1707 1708 if (act) { 1709 nvme_aor_inc_active(ns); 1710 } 1711 1712 nvme_aor_inc_open(ns); 1713 1714 if (flags & NVME_ZRM_AUTO) { 1715 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); 1716 return NVME_SUCCESS; 1717 } 1718 1719 /* fallthrough */ 1720 1721 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1722 if (flags & NVME_ZRM_AUTO) { 1723 return NVME_SUCCESS; 1724 } 1725 1726 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); 1727 1728 /* fallthrough */ 1729 1730 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1731 return NVME_SUCCESS; 1732 1733 default: 1734 return NVME_ZONE_INVAL_TRANSITION; 1735 } 1736 } 1737 1738 static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone) 1739 { 1740 return nvme_zrm_open_flags(ns, zone, NVME_ZRM_AUTO); 1741 } 1742 1743 static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone) 1744 { 1745 return nvme_zrm_open_flags(ns, zone, 0); 1746 } 1747 1748 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, 1749 uint32_t nlb) 1750 { 1751 zone->d.wp += nlb; 1752 1753 if (zone->d.wp == nvme_zone_wr_boundary(zone)) { 1754 nvme_zrm_finish(ns, zone); 1755 } 1756 } 1757 1758 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) 1759 { 1760 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1761 NvmeZone *zone; 1762 uint64_t slba; 1763 
uint32_t nlb; 1764 1765 slba = le64_to_cpu(rw->slba); 1766 nlb = le16_to_cpu(rw->nlb) + 1; 1767 zone = nvme_get_zone_by_slba(ns, slba); 1768 1769 nvme_advance_zone_wp(ns, zone, nlb); 1770 } 1771 1772 static inline bool nvme_is_write(NvmeRequest *req) 1773 { 1774 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1775 1776 return rw->opcode == NVME_CMD_WRITE || 1777 rw->opcode == NVME_CMD_ZONE_APPEND || 1778 rw->opcode == NVME_CMD_WRITE_ZEROES; 1779 } 1780 1781 static void nvme_misc_cb(void *opaque, int ret) 1782 { 1783 NvmeRequest *req = opaque; 1784 NvmeNamespace *ns = req->ns; 1785 1786 BlockBackend *blk = ns->blkconf.blk; 1787 BlockAcctCookie *acct = &req->acct; 1788 BlockAcctStats *stats = blk_get_stats(blk); 1789 1790 trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk)); 1791 1792 if (ret) { 1793 block_acct_failed(stats, acct); 1794 nvme_aio_err(req, ret); 1795 } else { 1796 block_acct_done(stats, acct); 1797 } 1798 1799 nvme_enqueue_req_completion(nvme_cq(req), req); 1800 } 1801 1802 void nvme_rw_complete_cb(void *opaque, int ret) 1803 { 1804 NvmeRequest *req = opaque; 1805 NvmeNamespace *ns = req->ns; 1806 BlockBackend *blk = ns->blkconf.blk; 1807 BlockAcctCookie *acct = &req->acct; 1808 BlockAcctStats *stats = blk_get_stats(blk); 1809 1810 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk)); 1811 1812 if (ret) { 1813 block_acct_failed(stats, acct); 1814 nvme_aio_err(req, ret); 1815 } else { 1816 block_acct_done(stats, acct); 1817 } 1818 1819 if (ns->params.zoned && nvme_is_write(req)) { 1820 nvme_finalize_zoned_write(ns, req); 1821 } 1822 1823 nvme_enqueue_req_completion(nvme_cq(req), req); 1824 } 1825 1826 static void nvme_rw_cb(void *opaque, int ret) 1827 { 1828 NvmeRequest *req = opaque; 1829 NvmeNamespace *ns = req->ns; 1830 1831 BlockBackend *blk = ns->blkconf.blk; 1832 1833 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); 1834 1835 if (ret) { 1836 goto out; 1837 } 1838 1839 if (ns->lbaf.ms) { 1840 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1841 uint64_t slba = le64_to_cpu(rw->slba); 1842 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 1843 uint64_t offset = nvme_moff(ns, slba); 1844 1845 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) { 1846 size_t mlen = nvme_m2b(ns, nlb); 1847 1848 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen, 1849 BDRV_REQ_MAY_UNMAP, 1850 nvme_rw_complete_cb, req); 1851 return; 1852 } 1853 1854 if (nvme_ns_ext(ns) || req->cmd.mptr) { 1855 uint16_t status; 1856 1857 nvme_sg_unmap(&req->sg); 1858 status = nvme_map_mdata(nvme_ctrl(req), nlb, req); 1859 if (status) { 1860 ret = -EFAULT; 1861 goto out; 1862 } 1863 1864 if (req->cmd.opcode == NVME_CMD_READ) { 1865 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req); 1866 } 1867 1868 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req); 1869 } 1870 } 1871 1872 out: 1873 nvme_rw_complete_cb(req, ret); 1874 } 1875 1876 struct nvme_aio_format_ctx { 1877 NvmeRequest *req; 1878 NvmeNamespace *ns; 1879 1880 /* number of outstanding write zeroes for this namespace */ 1881 int *count; 1882 }; 1883 1884 static void nvme_aio_format_cb(void *opaque, int ret) 1885 { 1886 struct nvme_aio_format_ctx *ctx = opaque; 1887 NvmeRequest *req = ctx->req; 1888 NvmeNamespace *ns = ctx->ns; 1889 uintptr_t *num_formats = (uintptr_t *)&req->opaque; 1890 int *count = ctx->count; 1891 1892 g_free(ctx); 1893 1894 if (ret) { 1895 nvme_aio_err(req, ret); 1896 } 1897 1898 if (--(*count)) { 1899 return; 1900 } 1901 1902 g_free(count); 1903 ns->status = 0x0; 1904 1905 if (--(*num_formats)) { 1906 return; 1907 } 1908 1909 
nvme_enqueue_req_completion(nvme_cq(req), req); 1910 } 1911 1912 struct nvme_aio_flush_ctx { 1913 NvmeRequest *req; 1914 NvmeNamespace *ns; 1915 BlockAcctCookie acct; 1916 }; 1917 1918 static void nvme_aio_flush_cb(void *opaque, int ret) 1919 { 1920 struct nvme_aio_flush_ctx *ctx = opaque; 1921 NvmeRequest *req = ctx->req; 1922 uintptr_t *num_flushes = (uintptr_t *)&req->opaque; 1923 1924 BlockBackend *blk = ctx->ns->blkconf.blk; 1925 BlockAcctCookie *acct = &ctx->acct; 1926 BlockAcctStats *stats = blk_get_stats(blk); 1927 1928 trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk)); 1929 1930 if (!ret) { 1931 block_acct_done(stats, acct); 1932 } else { 1933 block_acct_failed(stats, acct); 1934 nvme_aio_err(req, ret); 1935 } 1936 1937 (*num_flushes)--; 1938 g_free(ctx); 1939 1940 if (*num_flushes) { 1941 return; 1942 } 1943 1944 nvme_enqueue_req_completion(nvme_cq(req), req); 1945 } 1946 1947 static void nvme_verify_cb(void *opaque, int ret) 1948 { 1949 NvmeBounceContext *ctx = opaque; 1950 NvmeRequest *req = ctx->req; 1951 NvmeNamespace *ns = req->ns; 1952 BlockBackend *blk = ns->blkconf.blk; 1953 BlockAcctCookie *acct = &req->acct; 1954 BlockAcctStats *stats = blk_get_stats(blk); 1955 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1956 uint64_t slba = le64_to_cpu(rw->slba); 1957 uint16_t ctrl = le16_to_cpu(rw->control); 1958 uint16_t apptag = le16_to_cpu(rw->apptag); 1959 uint16_t appmask = le16_to_cpu(rw->appmask); 1960 uint32_t reftag = le32_to_cpu(rw->reftag); 1961 uint16_t status; 1962 1963 trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag, 1964 appmask, reftag); 1965 1966 if (ret) { 1967 block_acct_failed(stats, acct); 1968 nvme_aio_err(req, ret); 1969 goto out; 1970 } 1971 1972 block_acct_done(stats, acct); 1973 1974 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 1975 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce, 1976 ctx->mdata.iov.size, slba); 1977 if (status) { 1978 req->status = status; 1979 goto out; 1980 } 1981 1982 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 1983 ctx->mdata.bounce, ctx->mdata.iov.size, 1984 ctrl, slba, apptag, appmask, reftag); 1985 } 1986 1987 out: 1988 qemu_iovec_destroy(&ctx->data.iov); 1989 g_free(ctx->data.bounce); 1990 1991 qemu_iovec_destroy(&ctx->mdata.iov); 1992 g_free(ctx->mdata.bounce); 1993 1994 g_free(ctx); 1995 1996 nvme_enqueue_req_completion(nvme_cq(req), req); 1997 } 1998 1999 2000 static void nvme_verify_mdata_in_cb(void *opaque, int ret) 2001 { 2002 NvmeBounceContext *ctx = opaque; 2003 NvmeRequest *req = ctx->req; 2004 NvmeNamespace *ns = req->ns; 2005 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2006 uint64_t slba = le64_to_cpu(rw->slba); 2007 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2008 size_t mlen = nvme_m2b(ns, nlb); 2009 uint64_t offset = nvme_moff(ns, slba); 2010 BlockBackend *blk = ns->blkconf.blk; 2011 2012 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk)); 2013 2014 if (ret) { 2015 goto out; 2016 } 2017 2018 ctx->mdata.bounce = g_malloc(mlen); 2019 2020 qemu_iovec_reset(&ctx->mdata.iov); 2021 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2022 2023 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2024 nvme_verify_cb, ctx); 2025 return; 2026 2027 out: 2028 nvme_verify_cb(ctx, ret); 2029 } 2030 2031 static void nvme_aio_discard_cb(void *opaque, int ret) 2032 { 2033 NvmeRequest *req = opaque; 2034 uintptr_t *discards = (uintptr_t *)&req->opaque; 2035 2036 trace_pci_nvme_aio_discard_cb(nvme_cid(req)); 2037 2038 if (ret) { 2039 nvme_aio_err(req, ret); 2040 } 
2041 2042 (*discards)--; 2043 2044 if (*discards) { 2045 return; 2046 } 2047 2048 nvme_enqueue_req_completion(nvme_cq(req), req); 2049 } 2050 2051 struct nvme_zone_reset_ctx { 2052 NvmeRequest *req; 2053 NvmeZone *zone; 2054 }; 2055 2056 static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret) 2057 { 2058 struct nvme_zone_reset_ctx *ctx = opaque; 2059 NvmeRequest *req = ctx->req; 2060 NvmeNamespace *ns = req->ns; 2061 NvmeZone *zone = ctx->zone; 2062 uintptr_t *resets = (uintptr_t *)&req->opaque; 2063 2064 if (ret) { 2065 nvme_aio_err(req, ret); 2066 goto out; 2067 } 2068 2069 switch (nvme_get_zone_state(zone)) { 2070 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 2071 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 2072 nvme_aor_dec_open(ns); 2073 /* fall through */ 2074 case NVME_ZONE_STATE_CLOSED: 2075 nvme_aor_dec_active(ns); 2076 /* fall through */ 2077 case NVME_ZONE_STATE_FULL: 2078 zone->w_ptr = zone->d.zslba; 2079 zone->d.wp = zone->w_ptr; 2080 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); 2081 /* fall through */ 2082 default: 2083 break; 2084 } 2085 2086 out: 2087 g_free(ctx); 2088 2089 (*resets)--; 2090 2091 if (*resets) { 2092 return; 2093 } 2094 2095 nvme_enqueue_req_completion(nvme_cq(req), req); 2096 } 2097 2098 static void nvme_aio_zone_reset_cb(void *opaque, int ret) 2099 { 2100 struct nvme_zone_reset_ctx *ctx = opaque; 2101 NvmeRequest *req = ctx->req; 2102 NvmeNamespace *ns = req->ns; 2103 NvmeZone *zone = ctx->zone; 2104 2105 trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba); 2106 2107 if (ret) { 2108 goto out; 2109 } 2110 2111 if (ns->lbaf.ms) { 2112 int64_t offset = nvme_moff(ns, zone->d.zslba); 2113 2114 blk_aio_pwrite_zeroes(ns->blkconf.blk, offset, 2115 nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, 2116 nvme_aio_zone_reset_complete_cb, ctx); 2117 return; 2118 } 2119 2120 out: 2121 nvme_aio_zone_reset_complete_cb(opaque, ret); 2122 } 2123 2124 struct nvme_copy_ctx { 2125 int copies; 2126 uint8_t *bounce; 2127 uint8_t *mbounce; 2128 uint32_t nlb; 2129 NvmeCopySourceRange *ranges; 2130 }; 2131 2132 struct nvme_copy_in_ctx { 2133 NvmeRequest *req; 2134 QEMUIOVector iov; 2135 NvmeCopySourceRange *range; 2136 }; 2137 2138 static void nvme_copy_complete_cb(void *opaque, int ret) 2139 { 2140 NvmeRequest *req = opaque; 2141 NvmeNamespace *ns = req->ns; 2142 struct nvme_copy_ctx *ctx = req->opaque; 2143 2144 if (ret) { 2145 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); 2146 nvme_aio_err(req, ret); 2147 goto out; 2148 } 2149 2150 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); 2151 2152 out: 2153 if (ns->params.zoned) { 2154 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2155 uint64_t sdlba = le64_to_cpu(copy->sdlba); 2156 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); 2157 2158 nvme_advance_zone_wp(ns, zone, ctx->nlb); 2159 } 2160 2161 g_free(ctx->bounce); 2162 g_free(ctx->mbounce); 2163 g_free(ctx); 2164 2165 nvme_enqueue_req_completion(nvme_cq(req), req); 2166 } 2167 2168 static void nvme_copy_cb(void *opaque, int ret) 2169 { 2170 NvmeRequest *req = opaque; 2171 NvmeNamespace *ns = req->ns; 2172 struct nvme_copy_ctx *ctx = req->opaque; 2173 2174 trace_pci_nvme_copy_cb(nvme_cid(req)); 2175 2176 if (ret) { 2177 goto out; 2178 } 2179 2180 if (ns->lbaf.ms) { 2181 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2182 uint64_t sdlba = le64_to_cpu(copy->sdlba); 2183 int64_t offset = nvme_moff(ns, sdlba); 2184 2185 qemu_iovec_reset(&req->sg.iov); 2186 qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb)); 2187 2188 
req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0, 2189 nvme_copy_complete_cb, req); 2190 return; 2191 } 2192 2193 out: 2194 nvme_copy_complete_cb(opaque, ret); 2195 } 2196 2197 static void nvme_copy_in_complete(NvmeRequest *req) 2198 { 2199 NvmeNamespace *ns = req->ns; 2200 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2201 struct nvme_copy_ctx *ctx = req->opaque; 2202 uint64_t sdlba = le64_to_cpu(copy->sdlba); 2203 uint16_t status; 2204 2205 trace_pci_nvme_copy_in_complete(nvme_cid(req)); 2206 2207 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); 2208 2209 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2210 uint16_t prinfor = (copy->control[0] >> 4) & 0xf; 2211 uint16_t prinfow = (copy->control[2] >> 2) & 0xf; 2212 uint16_t nr = copy->nr + 1; 2213 NvmeCopySourceRange *range; 2214 uint64_t slba; 2215 uint32_t nlb; 2216 uint16_t apptag, appmask; 2217 uint32_t reftag; 2218 uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce; 2219 size_t len, mlen; 2220 int i; 2221 2222 /* 2223 * The dif helpers expects prinfo to be similar to the control field of 2224 * the NvmeRwCmd, so shift by 10 to fake it. 2225 */ 2226 prinfor = prinfor << 10; 2227 prinfow = prinfow << 10; 2228 2229 for (i = 0; i < nr; i++) { 2230 range = &ctx->ranges[i]; 2231 slba = le64_to_cpu(range->slba); 2232 nlb = le16_to_cpu(range->nlb) + 1; 2233 len = nvme_l2b(ns, nlb); 2234 mlen = nvme_m2b(ns, nlb); 2235 apptag = le16_to_cpu(range->apptag); 2236 appmask = le16_to_cpu(range->appmask); 2237 reftag = le32_to_cpu(range->reftag); 2238 2239 status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba, 2240 apptag, appmask, reftag); 2241 if (status) { 2242 goto invalid; 2243 } 2244 2245 buf += len; 2246 mbuf += mlen; 2247 } 2248 2249 apptag = le16_to_cpu(copy->apptag); 2250 appmask = le16_to_cpu(copy->appmask); 2251 reftag = le32_to_cpu(copy->reftag); 2252 2253 if (prinfow & NVME_RW_PRINFO_PRACT) { 2254 size_t len = nvme_l2b(ns, ctx->nlb); 2255 size_t mlen = nvme_m2b(ns, ctx->nlb); 2256 2257 status = nvme_check_prinfo(ns, prinfow, sdlba, reftag); 2258 if (status) { 2259 goto invalid; 2260 } 2261 2262 nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce, 2263 mlen, apptag, reftag); 2264 } else { 2265 status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen, 2266 prinfow, sdlba, apptag, appmask, reftag); 2267 if (status) { 2268 goto invalid; 2269 } 2270 } 2271 } 2272 2273 status = nvme_check_bounds(ns, sdlba, ctx->nlb); 2274 if (status) { 2275 goto invalid; 2276 } 2277 2278 if (ns->params.zoned) { 2279 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); 2280 2281 status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb); 2282 if (status) { 2283 goto invalid; 2284 } 2285 2286 status = nvme_zrm_auto(ns, zone); 2287 if (status) { 2288 goto invalid; 2289 } 2290 2291 zone->w_ptr += ctx->nlb; 2292 } 2293 2294 qemu_iovec_init(&req->sg.iov, 1); 2295 qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); 2296 2297 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, 2298 BLOCK_ACCT_WRITE); 2299 2300 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba), 2301 &req->sg.iov, 0, nvme_copy_cb, req); 2302 2303 return; 2304 2305 invalid: 2306 req->status = status; 2307 2308 g_free(ctx->bounce); 2309 g_free(ctx); 2310 2311 nvme_enqueue_req_completion(nvme_cq(req), req); 2312 } 2313 2314 static void nvme_aio_copy_in_cb(void *opaque, int ret) 2315 { 2316 struct nvme_copy_in_ctx *in_ctx = opaque; 2317 NvmeRequest *req = in_ctx->req; 2318 NvmeNamespace *ns = req->ns; 2319 struct 
nvme_copy_ctx *ctx = req->opaque; 2320 2321 qemu_iovec_destroy(&in_ctx->iov); 2322 g_free(in_ctx); 2323 2324 trace_pci_nvme_aio_copy_in_cb(nvme_cid(req)); 2325 2326 if (ret) { 2327 nvme_aio_err(req, ret); 2328 } 2329 2330 ctx->copies--; 2331 2332 if (ctx->copies) { 2333 return; 2334 } 2335 2336 if (req->status) { 2337 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); 2338 2339 g_free(ctx->bounce); 2340 g_free(ctx->mbounce); 2341 g_free(ctx); 2342 2343 nvme_enqueue_req_completion(nvme_cq(req), req); 2344 2345 return; 2346 } 2347 2348 nvme_copy_in_complete(req); 2349 } 2350 2351 struct nvme_compare_ctx { 2352 struct { 2353 QEMUIOVector iov; 2354 uint8_t *bounce; 2355 } data; 2356 2357 struct { 2358 QEMUIOVector iov; 2359 uint8_t *bounce; 2360 } mdata; 2361 }; 2362 2363 static void nvme_compare_mdata_cb(void *opaque, int ret) 2364 { 2365 NvmeRequest *req = opaque; 2366 NvmeNamespace *ns = req->ns; 2367 NvmeCtrl *n = nvme_ctrl(req); 2368 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2369 uint16_t ctrl = le16_to_cpu(rw->control); 2370 uint16_t apptag = le16_to_cpu(rw->apptag); 2371 uint16_t appmask = le16_to_cpu(rw->appmask); 2372 uint32_t reftag = le32_to_cpu(rw->reftag); 2373 struct nvme_compare_ctx *ctx = req->opaque; 2374 g_autofree uint8_t *buf = NULL; 2375 BlockBackend *blk = ns->blkconf.blk; 2376 BlockAcctCookie *acct = &req->acct; 2377 BlockAcctStats *stats = blk_get_stats(blk); 2378 uint16_t status = NVME_SUCCESS; 2379 2380 trace_pci_nvme_compare_mdata_cb(nvme_cid(req)); 2381 2382 if (ret) { 2383 block_acct_failed(stats, acct); 2384 nvme_aio_err(req, ret); 2385 goto out; 2386 } 2387 2388 buf = g_malloc(ctx->mdata.iov.size); 2389 2390 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size, 2391 NVME_TX_DIRECTION_TO_DEVICE, req); 2392 if (status) { 2393 req->status = status; 2394 goto out; 2395 } 2396 2397 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2398 uint64_t slba = le64_to_cpu(rw->slba); 2399 uint8_t *bufp; 2400 uint8_t *mbufp = ctx->mdata.bounce; 2401 uint8_t *end = mbufp + ctx->mdata.iov.size; 2402 int16_t pil = 0; 2403 2404 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 2405 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl, 2406 slba, apptag, appmask, reftag); 2407 if (status) { 2408 req->status = status; 2409 goto out; 2410 } 2411 2412 /* 2413 * When formatted with protection information, do not compare the DIF 2414 * tuple. 
2415 */ 2416 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) { 2417 pil = ns->lbaf.ms - sizeof(NvmeDifTuple); 2418 } 2419 2420 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) { 2421 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) { 2422 req->status = NVME_CMP_FAILURE; 2423 goto out; 2424 } 2425 } 2426 2427 goto out; 2428 } 2429 2430 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) { 2431 req->status = NVME_CMP_FAILURE; 2432 goto out; 2433 } 2434 2435 block_acct_done(stats, acct); 2436 2437 out: 2438 qemu_iovec_destroy(&ctx->data.iov); 2439 g_free(ctx->data.bounce); 2440 2441 qemu_iovec_destroy(&ctx->mdata.iov); 2442 g_free(ctx->mdata.bounce); 2443 2444 g_free(ctx); 2445 2446 nvme_enqueue_req_completion(nvme_cq(req), req); 2447 } 2448 2449 static void nvme_compare_data_cb(void *opaque, int ret) 2450 { 2451 NvmeRequest *req = opaque; 2452 NvmeCtrl *n = nvme_ctrl(req); 2453 NvmeNamespace *ns = req->ns; 2454 BlockBackend *blk = ns->blkconf.blk; 2455 BlockAcctCookie *acct = &req->acct; 2456 BlockAcctStats *stats = blk_get_stats(blk); 2457 2458 struct nvme_compare_ctx *ctx = req->opaque; 2459 g_autofree uint8_t *buf = NULL; 2460 uint16_t status; 2461 2462 trace_pci_nvme_compare_data_cb(nvme_cid(req)); 2463 2464 if (ret) { 2465 block_acct_failed(stats, acct); 2466 nvme_aio_err(req, ret); 2467 goto out; 2468 } 2469 2470 buf = g_malloc(ctx->data.iov.size); 2471 2472 status = nvme_bounce_data(n, buf, ctx->data.iov.size, 2473 NVME_TX_DIRECTION_TO_DEVICE, req); 2474 if (status) { 2475 req->status = status; 2476 goto out; 2477 } 2478 2479 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) { 2480 req->status = NVME_CMP_FAILURE; 2481 goto out; 2482 } 2483 2484 if (ns->lbaf.ms) { 2485 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2486 uint64_t slba = le64_to_cpu(rw->slba); 2487 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2488 size_t mlen = nvme_m2b(ns, nlb); 2489 uint64_t offset = nvme_moff(ns, slba); 2490 2491 ctx->mdata.bounce = g_malloc(mlen); 2492 2493 qemu_iovec_init(&ctx->mdata.iov, 1); 2494 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2495 2496 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2497 nvme_compare_mdata_cb, req); 2498 return; 2499 } 2500 2501 block_acct_done(stats, acct); 2502 2503 out: 2504 qemu_iovec_destroy(&ctx->data.iov); 2505 g_free(ctx->data.bounce); 2506 g_free(ctx); 2507 2508 nvme_enqueue_req_completion(nvme_cq(req), req); 2509 } 2510 2511 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) 2512 { 2513 NvmeNamespace *ns = req->ns; 2514 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; 2515 2516 uint32_t attr = le32_to_cpu(dsm->attributes); 2517 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; 2518 2519 uint16_t status = NVME_SUCCESS; 2520 2521 trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); 2522 2523 if (attr & NVME_DSMGMT_AD) { 2524 int64_t offset; 2525 size_t len; 2526 NvmeDsmRange range[nr]; 2527 uintptr_t *discards = (uintptr_t *)&req->opaque; 2528 2529 status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req); 2530 if (status) { 2531 return status; 2532 } 2533 2534 /* 2535 * AIO callbacks may be called immediately, so initialize discards to 1 2536 to make sure the callback does not complete the request before 2537 all discards have been issued.
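 * The matching decrement further down ("account for the 1-initialization")
 * drops that initial reference; if all discards already completed
 * synchronously, the request is completed inline by returning req->status
 * instead of NVME_NO_COMPLETE.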
2538 */ 2539 *discards = 1; 2540 2541 for (int i = 0; i < nr; i++) { 2542 uint64_t slba = le64_to_cpu(range[i].slba); 2543 uint32_t nlb = le32_to_cpu(range[i].nlb); 2544 2545 if (nvme_check_bounds(ns, slba, nlb)) { 2546 continue; 2547 } 2548 2549 trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, 2550 nlb); 2551 2552 if (nlb > n->dmrsl) { 2553 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); 2554 } 2555 2556 offset = nvme_l2b(ns, slba); 2557 len = nvme_l2b(ns, nlb); 2558 2559 while (len) { 2560 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); 2561 2562 (*discards)++; 2563 2564 blk_aio_pdiscard(ns->blkconf.blk, offset, bytes, 2565 nvme_aio_discard_cb, req); 2566 2567 offset += bytes; 2568 len -= bytes; 2569 } 2570 } 2571 2572 /* account for the 1-initialization */ 2573 (*discards)--; 2574 2575 if (*discards) { 2576 status = NVME_NO_COMPLETE; 2577 } else { 2578 status = req->status; 2579 } 2580 } 2581 2582 return status; 2583 } 2584 2585 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) 2586 { 2587 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2588 NvmeNamespace *ns = req->ns; 2589 BlockBackend *blk = ns->blkconf.blk; 2590 uint64_t slba = le64_to_cpu(rw->slba); 2591 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2592 size_t len = nvme_l2b(ns, nlb); 2593 int64_t offset = nvme_l2b(ns, slba); 2594 uint16_t ctrl = le16_to_cpu(rw->control); 2595 uint32_t reftag = le32_to_cpu(rw->reftag); 2596 NvmeBounceContext *ctx = NULL; 2597 uint16_t status; 2598 2599 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2600 2601 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2602 status = nvme_check_prinfo(ns, ctrl, slba, reftag); 2603 if (status) { 2604 return status; 2605 } 2606 2607 if (ctrl & NVME_RW_PRINFO_PRACT) { 2608 return NVME_INVALID_PROT_INFO | NVME_DNR; 2609 } 2610 } 2611 2612 if (len > n->page_size << n->params.vsl) { 2613 return NVME_INVALID_FIELD | NVME_DNR; 2614 } 2615 2616 status = nvme_check_bounds(ns, slba, nlb); 2617 if (status) { 2618 return status; 2619 } 2620 2621 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2622 status = nvme_check_dulbe(ns, slba, nlb); 2623 if (status) { 2624 return status; 2625 } 2626 } 2627 2628 ctx = g_new0(NvmeBounceContext, 1); 2629 ctx->req = req; 2630 2631 ctx->data.bounce = g_malloc(len); 2632 2633 qemu_iovec_init(&ctx->data.iov, 1); 2634 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len); 2635 2636 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size, 2637 BLOCK_ACCT_READ); 2638 2639 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0, 2640 nvme_verify_mdata_in_cb, ctx); 2641 return NVME_NO_COMPLETE; 2642 } 2643 2644 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) 2645 { 2646 NvmeNamespace *ns = req->ns; 2647 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2648 2649 uint16_t nr = copy->nr + 1; 2650 uint8_t format = copy->control[0] & 0xf; 2651 2652 /* 2653 * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the 2654 * NVME_RW_PRINFO constants. 
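 * PRINFO sits in bits 13:10 of the 16-bit control field of a Read/Write
 * command (CDW12 bits 29:26), so shifting the 4-bit values from the copy
 * command by 10 makes e.g. the PRACT bit line up with NVME_RW_PRINFO_PRACT.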
2655 */ 2656 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10; 2657 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10; 2658 2659 uint32_t nlb = 0; 2660 uint8_t *bounce = NULL, *bouncep = NULL; 2661 uint8_t *mbounce = NULL, *mbouncep = NULL; 2662 struct nvme_copy_ctx *ctx; 2663 uint16_t status; 2664 int i; 2665 2666 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); 2667 2668 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && 2669 ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) { 2670 return NVME_INVALID_FIELD | NVME_DNR; 2671 } 2672 2673 if (!(n->id_ctrl.ocfs & (1 << format))) { 2674 trace_pci_nvme_err_copy_invalid_format(format); 2675 return NVME_INVALID_FIELD | NVME_DNR; 2676 } 2677 2678 if (nr > ns->id_ns.msrc + 1) { 2679 return NVME_CMD_SIZE_LIMIT | NVME_DNR; 2680 } 2681 2682 ctx = g_new(struct nvme_copy_ctx, 1); 2683 ctx->ranges = g_new(NvmeCopySourceRange, nr); 2684 2685 status = nvme_h2c(n, (uint8_t *)ctx->ranges, 2686 nr * sizeof(NvmeCopySourceRange), req); 2687 if (status) { 2688 goto out; 2689 } 2690 2691 for (i = 0; i < nr; i++) { 2692 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba); 2693 uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1; 2694 2695 if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) { 2696 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 2697 goto out; 2698 } 2699 2700 status = nvme_check_bounds(ns, slba, _nlb); 2701 if (status) { 2702 goto out; 2703 } 2704 2705 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2706 status = nvme_check_dulbe(ns, slba, _nlb); 2707 if (status) { 2708 goto out; 2709 } 2710 } 2711 2712 if (ns->params.zoned) { 2713 status = nvme_check_zone_read(ns, slba, _nlb); 2714 if (status) { 2715 goto out; 2716 } 2717 } 2718 2719 nlb += _nlb; 2720 } 2721 2722 if (nlb > le32_to_cpu(ns->id_ns.mcl)) { 2723 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 2724 goto out; 2725 } 2726 2727 bounce = bouncep = g_malloc(nvme_l2b(ns, nlb)); 2728 if (ns->lbaf.ms) { 2729 mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb)); 2730 } 2731 2732 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, 2733 BLOCK_ACCT_READ); 2734 2735 ctx->bounce = bounce; 2736 ctx->mbounce = mbounce; 2737 ctx->nlb = nlb; 2738 ctx->copies = 1; 2739 2740 req->opaque = ctx; 2741 2742 for (i = 0; i < nr; i++) { 2743 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba); 2744 uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1; 2745 2746 size_t len = nvme_l2b(ns, nlb); 2747 int64_t offset = nvme_l2b(ns, slba); 2748 2749 trace_pci_nvme_copy_source_range(slba, nlb); 2750 2751 struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1); 2752 in_ctx->req = req; 2753 2754 qemu_iovec_init(&in_ctx->iov, 1); 2755 qemu_iovec_add(&in_ctx->iov, bouncep, len); 2756 2757 ctx->copies++; 2758 2759 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, 2760 nvme_aio_copy_in_cb, in_ctx); 2761 2762 bouncep += len; 2763 2764 if (ns->lbaf.ms) { 2765 len = nvme_m2b(ns, nlb); 2766 offset = nvme_moff(ns, slba); 2767 2768 in_ctx = g_new(struct nvme_copy_in_ctx, 1); 2769 in_ctx->req = req; 2770 2771 qemu_iovec_init(&in_ctx->iov, 1); 2772 qemu_iovec_add(&in_ctx->iov, mbouncep, len); 2773 2774 ctx->copies++; 2775 2776 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, 2777 nvme_aio_copy_in_cb, in_ctx); 2778 2779 mbouncep += len; 2780 } 2781 } 2782 2783 /* account for the 1-initialization */ 2784 ctx->copies--; 2785 2786 if (!ctx->copies) { 2787 nvme_copy_in_complete(req); 2788 } 2789 2790 return NVME_NO_COMPLETE; 2791 2792 out: 2793 g_free(ctx->ranges); 2794 g_free(ctx); 2795 2796 
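/*
 * All failure paths jump here before any read AIO has been issued, so the
 * command can simply be failed synchronously with the status set above.
 */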
return status; 2797 } 2798 2799 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) 2800 { 2801 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2802 NvmeNamespace *ns = req->ns; 2803 BlockBackend *blk = ns->blkconf.blk; 2804 uint64_t slba = le64_to_cpu(rw->slba); 2805 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2806 uint16_t ctrl = le16_to_cpu(rw->control); 2807 size_t data_len = nvme_l2b(ns, nlb); 2808 size_t len = data_len; 2809 int64_t offset = nvme_l2b(ns, slba); 2810 struct nvme_compare_ctx *ctx = NULL; 2811 uint16_t status; 2812 2813 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2814 2815 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) { 2816 return NVME_INVALID_PROT_INFO | NVME_DNR; 2817 } 2818 2819 if (nvme_ns_ext(ns)) { 2820 len += nvme_m2b(ns, nlb); 2821 } 2822 2823 status = nvme_check_mdts(n, len); 2824 if (status) { 2825 return status; 2826 } 2827 2828 status = nvme_check_bounds(ns, slba, nlb); 2829 if (status) { 2830 return status; 2831 } 2832 2833 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2834 status = nvme_check_dulbe(ns, slba, nlb); 2835 if (status) { 2836 return status; 2837 } 2838 } 2839 2840 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 2841 if (status) { 2842 return status; 2843 } 2844 2845 ctx = g_new(struct nvme_compare_ctx, 1); 2846 ctx->data.bounce = g_malloc(data_len); 2847 2848 req->opaque = ctx; 2849 2850 qemu_iovec_init(&ctx->data.iov, 1); 2851 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len); 2852 2853 block_acct_start(blk_get_stats(blk), &req->acct, data_len, 2854 BLOCK_ACCT_READ); 2855 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0, 2856 nvme_compare_data_cb, req); 2857 2858 return NVME_NO_COMPLETE; 2859 } 2860 2861 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) 2862 { 2863 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 2864 uintptr_t *num_flushes = (uintptr_t *)&req->opaque; 2865 uint16_t status; 2866 struct nvme_aio_flush_ctx *ctx; 2867 NvmeNamespace *ns; 2868 2869 trace_pci_nvme_flush(nvme_cid(req), nsid); 2870 2871 if (nsid != NVME_NSID_BROADCAST) { 2872 req->ns = nvme_ns(n, nsid); 2873 if (unlikely(!req->ns)) { 2874 return NVME_INVALID_FIELD | NVME_DNR; 2875 } 2876 2877 block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, 2878 BLOCK_ACCT_FLUSH); 2879 req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req); 2880 return NVME_NO_COMPLETE; 2881 } 2882 2883 /* 1-initialize; see comment in nvme_dsm */ 2884 *num_flushes = 1; 2885 2886 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { 2887 ns = nvme_ns(n, i); 2888 if (!ns) { 2889 continue; 2890 } 2891 2892 ctx = g_new(struct nvme_aio_flush_ctx, 1); 2893 ctx->req = req; 2894 ctx->ns = ns; 2895 2896 (*num_flushes)++; 2897 2898 block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, 2899 BLOCK_ACCT_FLUSH); 2900 blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); 2901 } 2902 2903 /* account for the 1-initialization */ 2904 (*num_flushes)--; 2905 2906 if (*num_flushes) { 2907 status = NVME_NO_COMPLETE; 2908 } else { 2909 status = req->status; 2910 } 2911 2912 return status; 2913 } 2914 2915 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) 2916 { 2917 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2918 NvmeNamespace *ns = req->ns; 2919 uint64_t slba = le64_to_cpu(rw->slba); 2920 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 2921 uint16_t ctrl = le16_to_cpu(rw->control); 2922 uint64_t data_size = nvme_l2b(ns, nlb); 2923 uint64_t mapped_size = data_size; 2924 uint64_t data_offset; 2925 
BlockBackend *blk = ns->blkconf.blk; 2926 uint16_t status; 2927 2928 if (nvme_ns_ext(ns)) { 2929 mapped_size += nvme_m2b(ns, nlb); 2930 2931 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2932 bool pract = ctrl & NVME_RW_PRINFO_PRACT; 2933 2934 if (pract && ns->lbaf.ms == 8) { 2935 mapped_size = data_size; 2936 } 2937 } 2938 } 2939 2940 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba); 2941 2942 status = nvme_check_mdts(n, mapped_size); 2943 if (status) { 2944 goto invalid; 2945 } 2946 2947 status = nvme_check_bounds(ns, slba, nlb); 2948 if (status) { 2949 goto invalid; 2950 } 2951 2952 if (ns->params.zoned) { 2953 status = nvme_check_zone_read(ns, slba, nlb); 2954 if (status) { 2955 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); 2956 goto invalid; 2957 } 2958 } 2959 2960 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2961 status = nvme_check_dulbe(ns, slba, nlb); 2962 if (status) { 2963 goto invalid; 2964 } 2965 } 2966 2967 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2968 return nvme_dif_rw(n, req); 2969 } 2970 2971 status = nvme_map_data(n, nlb, req); 2972 if (status) { 2973 goto invalid; 2974 } 2975 2976 data_offset = nvme_l2b(ns, slba); 2977 2978 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 2979 BLOCK_ACCT_READ); 2980 nvme_blk_read(blk, data_offset, nvme_rw_cb, req); 2981 return NVME_NO_COMPLETE; 2982 2983 invalid: 2984 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); 2985 return status | NVME_DNR; 2986 } 2987 2988 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, 2989 bool wrz) 2990 { 2991 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2992 NvmeNamespace *ns = req->ns; 2993 uint64_t slba = le64_to_cpu(rw->slba); 2994 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 2995 uint16_t ctrl = le16_to_cpu(rw->control); 2996 uint64_t data_size = nvme_l2b(ns, nlb); 2997 uint64_t mapped_size = data_size; 2998 uint64_t data_offset; 2999 NvmeZone *zone; 3000 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; 3001 BlockBackend *blk = ns->blkconf.blk; 3002 uint16_t status; 3003 3004 if (nvme_ns_ext(ns)) { 3005 mapped_size += nvme_m2b(ns, nlb); 3006 3007 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3008 bool pract = ctrl & NVME_RW_PRINFO_PRACT; 3009 3010 if (pract && ns->lbaf.ms == 8) { 3011 mapped_size -= nvme_m2b(ns, nlb); 3012 } 3013 } 3014 } 3015 3016 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), 3017 nvme_nsid(ns), nlb, mapped_size, slba); 3018 3019 if (!wrz) { 3020 status = nvme_check_mdts(n, mapped_size); 3021 if (status) { 3022 goto invalid; 3023 } 3024 } 3025 3026 status = nvme_check_bounds(ns, slba, nlb); 3027 if (status) { 3028 goto invalid; 3029 } 3030 3031 if (ns->params.zoned) { 3032 zone = nvme_get_zone_by_slba(ns, slba); 3033 3034 if (append) { 3035 bool piremap = !!(ctrl & NVME_RW_PIREMAP); 3036 3037 if (unlikely(slba != zone->d.zslba)) { 3038 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); 3039 status = NVME_INVALID_FIELD; 3040 goto invalid; 3041 } 3042 3043 if (n->params.zasl && 3044 data_size > (uint64_t)n->page_size << n->params.zasl) { 3045 trace_pci_nvme_err_zasl(data_size); 3046 return NVME_INVALID_FIELD | NVME_DNR; 3047 } 3048 3049 slba = zone->w_ptr; 3050 rw->slba = cpu_to_le64(slba); 3051 res->slba = cpu_to_le64(slba); 3052 3053 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3054 case NVME_ID_NS_DPS_TYPE_1: 3055 if (!piremap) { 3056 return NVME_INVALID_PROT_INFO | NVME_DNR; 3057 } 3058 3059 /* fallthrough */ 3060 3061 case NVME_ID_NS_DPS_TYPE_2: 3062 if (piremap) { 3063 
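/*
 * Protection Information Remap: rebase the initial reference tag to the
 * LBA the appended data actually lands at (the zone write pointer)
 * instead of the ZSLBA given in the command.
 */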
uint32_t reftag = le32_to_cpu(rw->reftag); 3064 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba)); 3065 } 3066 3067 break; 3068 3069 case NVME_ID_NS_DPS_TYPE_3: 3070 if (piremap) { 3071 return NVME_INVALID_PROT_INFO | NVME_DNR; 3072 } 3073 3074 break; 3075 } 3076 } 3077 3078 status = nvme_check_zone_write(ns, zone, slba, nlb); 3079 if (status) { 3080 goto invalid; 3081 } 3082 3083 status = nvme_zrm_auto(ns, zone); 3084 if (status) { 3085 goto invalid; 3086 } 3087 3088 zone->w_ptr += nlb; 3089 } 3090 3091 data_offset = nvme_l2b(ns, slba); 3092 3093 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3094 return nvme_dif_rw(n, req); 3095 } 3096 3097 if (!wrz) { 3098 status = nvme_map_data(n, nlb, req); 3099 if (status) { 3100 goto invalid; 3101 } 3102 3103 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 3104 BLOCK_ACCT_WRITE); 3105 nvme_blk_write(blk, data_offset, nvme_rw_cb, req); 3106 } else { 3107 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, 3108 BDRV_REQ_MAY_UNMAP, nvme_rw_cb, 3109 req); 3110 } 3111 3112 return NVME_NO_COMPLETE; 3113 3114 invalid: 3115 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); 3116 return status | NVME_DNR; 3117 } 3118 3119 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) 3120 { 3121 return nvme_do_write(n, req, false, false); 3122 } 3123 3124 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) 3125 { 3126 return nvme_do_write(n, req, false, true); 3127 } 3128 3129 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) 3130 { 3131 return nvme_do_write(n, req, true, false); 3132 } 3133 3134 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, 3135 uint64_t *slba, uint32_t *zone_idx) 3136 { 3137 uint32_t dw10 = le32_to_cpu(c->cdw10); 3138 uint32_t dw11 = le32_to_cpu(c->cdw11); 3139 3140 if (!ns->params.zoned) { 3141 trace_pci_nvme_err_invalid_opc(c->opcode); 3142 return NVME_INVALID_OPCODE | NVME_DNR; 3143 } 3144 3145 *slba = ((uint64_t)dw11) << 32 | dw10; 3146 if (unlikely(*slba >= ns->id_ns.nsze)) { 3147 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze); 3148 *slba = 0; 3149 return NVME_LBA_RANGE | NVME_DNR; 3150 } 3151 3152 *zone_idx = nvme_zone_idx(ns, *slba); 3153 assert(*zone_idx < ns->num_zones); 3154 3155 return NVME_SUCCESS; 3156 } 3157 3158 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState, 3159 NvmeRequest *); 3160 3161 enum NvmeZoneProcessingMask { 3162 NVME_PROC_CURRENT_ZONE = 0, 3163 NVME_PROC_OPENED_ZONES = 1 << 0, 3164 NVME_PROC_CLOSED_ZONES = 1 << 1, 3165 NVME_PROC_READ_ONLY_ZONES = 1 << 2, 3166 NVME_PROC_FULL_ZONES = 1 << 3, 3167 }; 3168 3169 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, 3170 NvmeZoneState state, NvmeRequest *req) 3171 { 3172 return nvme_zrm_open(ns, zone); 3173 } 3174 3175 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, 3176 NvmeZoneState state, NvmeRequest *req) 3177 { 3178 return nvme_zrm_close(ns, zone); 3179 } 3180 3181 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, 3182 NvmeZoneState state, NvmeRequest *req) 3183 { 3184 return nvme_zrm_finish(ns, zone); 3185 } 3186 3187 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, 3188 NvmeZoneState state, NvmeRequest *req) 3189 { 3190 uintptr_t *resets = (uintptr_t *)&req->opaque; 3191 struct nvme_zone_reset_ctx *ctx; 3192 3193 switch (state) { 3194 case NVME_ZONE_STATE_EMPTY: 3195 return NVME_SUCCESS; 3196 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 3197 case 
NVME_ZONE_STATE_IMPLICITLY_OPEN: 3198 case NVME_ZONE_STATE_CLOSED: 3199 case NVME_ZONE_STATE_FULL: 3200 break; 3201 default: 3202 return NVME_ZONE_INVAL_TRANSITION; 3203 } 3204 3205 /* 3206 * The zone reset aio callback needs to know the zone that is being reset 3207 * in order to transition the zone on completion. 3208 */ 3209 ctx = g_new(struct nvme_zone_reset_ctx, 1); 3210 ctx->req = req; 3211 ctx->zone = zone; 3212 3213 (*resets)++; 3214 3215 blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba), 3216 nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, 3217 nvme_aio_zone_reset_cb, ctx); 3218 3219 return NVME_NO_COMPLETE; 3220 } 3221 3222 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, 3223 NvmeZoneState state, NvmeRequest *req) 3224 { 3225 switch (state) { 3226 case NVME_ZONE_STATE_READ_ONLY: 3227 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE); 3228 /* fall through */ 3229 case NVME_ZONE_STATE_OFFLINE: 3230 return NVME_SUCCESS; 3231 default: 3232 return NVME_ZONE_INVAL_TRANSITION; 3233 } 3234 } 3235 3236 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) 3237 { 3238 uint16_t status; 3239 uint8_t state = nvme_get_zone_state(zone); 3240 3241 if (state == NVME_ZONE_STATE_EMPTY) { 3242 status = nvme_aor_check(ns, 1, 0); 3243 if (status) { 3244 return status; 3245 } 3246 nvme_aor_inc_active(ns); 3247 zone->d.za |= NVME_ZA_ZD_EXT_VALID; 3248 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 3249 return NVME_SUCCESS; 3250 } 3251 3252 return NVME_ZONE_INVAL_TRANSITION; 3253 } 3254 3255 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, 3256 enum NvmeZoneProcessingMask proc_mask, 3257 op_handler_t op_hndlr, NvmeRequest *req) 3258 { 3259 uint16_t status = NVME_SUCCESS; 3260 NvmeZoneState zs = nvme_get_zone_state(zone); 3261 bool proc_zone; 3262 3263 switch (zs) { 3264 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 3265 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 3266 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; 3267 break; 3268 case NVME_ZONE_STATE_CLOSED: 3269 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; 3270 break; 3271 case NVME_ZONE_STATE_READ_ONLY: 3272 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES; 3273 break; 3274 case NVME_ZONE_STATE_FULL: 3275 proc_zone = proc_mask & NVME_PROC_FULL_ZONES; 3276 break; 3277 default: 3278 proc_zone = false; 3279 } 3280 3281 if (proc_zone) { 3282 status = op_hndlr(ns, zone, zs, req); 3283 } 3284 3285 return status; 3286 } 3287 3288 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, 3289 enum NvmeZoneProcessingMask proc_mask, 3290 op_handler_t op_hndlr, NvmeRequest *req) 3291 { 3292 NvmeZone *next; 3293 uint16_t status = NVME_SUCCESS; 3294 int i; 3295 3296 if (!proc_mask) { 3297 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req); 3298 } else { 3299 if (proc_mask & NVME_PROC_CLOSED_ZONES) { 3300 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { 3301 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3302 req); 3303 if (status && status != NVME_NO_COMPLETE) { 3304 goto out; 3305 } 3306 } 3307 } 3308 if (proc_mask & NVME_PROC_OPENED_ZONES) { 3309 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { 3310 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3311 req); 3312 if (status && status != NVME_NO_COMPLETE) { 3313 goto out; 3314 } 3315 } 3316 3317 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { 3318 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3319 req); 3320 if (status && status != 
NVME_NO_COMPLETE) { 3321 goto out; 3322 } 3323 } 3324 } 3325 if (proc_mask & NVME_PROC_FULL_ZONES) { 3326 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { 3327 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3328 req); 3329 if (status && status != NVME_NO_COMPLETE) { 3330 goto out; 3331 } 3332 } 3333 } 3334 3335 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { 3336 for (i = 0; i < ns->num_zones; i++, zone++) { 3337 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3338 req); 3339 if (status && status != NVME_NO_COMPLETE) { 3340 goto out; 3341 } 3342 } 3343 } 3344 } 3345 3346 out: 3347 return status; 3348 } 3349 3350 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) 3351 { 3352 NvmeCmd *cmd = (NvmeCmd *)&req->cmd; 3353 NvmeNamespace *ns = req->ns; 3354 NvmeZone *zone; 3355 uintptr_t *resets; 3356 uint8_t *zd_ext; 3357 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 3358 uint64_t slba = 0; 3359 uint32_t zone_idx = 0; 3360 uint16_t status; 3361 uint8_t action; 3362 bool all; 3363 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; 3364 3365 action = dw13 & 0xff; 3366 all = dw13 & 0x100; 3367 3368 req->status = NVME_SUCCESS; 3369 3370 if (!all) { 3371 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 3372 if (status) { 3373 return status; 3374 } 3375 } 3376 3377 zone = &ns->zone_array[zone_idx]; 3378 if (slba != zone->d.zslba) { 3379 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); 3380 return NVME_INVALID_FIELD | NVME_DNR; 3381 } 3382 3383 switch (action) { 3384 3385 case NVME_ZONE_ACTION_OPEN: 3386 if (all) { 3387 proc_mask = NVME_PROC_CLOSED_ZONES; 3388 } 3389 trace_pci_nvme_open_zone(slba, zone_idx, all); 3390 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req); 3391 break; 3392 3393 case NVME_ZONE_ACTION_CLOSE: 3394 if (all) { 3395 proc_mask = NVME_PROC_OPENED_ZONES; 3396 } 3397 trace_pci_nvme_close_zone(slba, zone_idx, all); 3398 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req); 3399 break; 3400 3401 case NVME_ZONE_ACTION_FINISH: 3402 if (all) { 3403 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; 3404 } 3405 trace_pci_nvme_finish_zone(slba, zone_idx, all); 3406 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req); 3407 break; 3408 3409 case NVME_ZONE_ACTION_RESET: 3410 resets = (uintptr_t *)&req->opaque; 3411 3412 if (all) { 3413 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | 3414 NVME_PROC_FULL_ZONES; 3415 } 3416 trace_pci_nvme_reset_zone(slba, zone_idx, all); 3417 3418 *resets = 1; 3419 3420 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req); 3421 3422 (*resets)--; 3423 3424 return *resets ? 
NVME_NO_COMPLETE : req->status; 3425 3426 case NVME_ZONE_ACTION_OFFLINE: 3427 if (all) { 3428 proc_mask = NVME_PROC_READ_ONLY_ZONES; 3429 } 3430 trace_pci_nvme_offline_zone(slba, zone_idx, all); 3431 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req); 3432 break; 3433 3434 case NVME_ZONE_ACTION_SET_ZD_EXT: 3435 trace_pci_nvme_set_descriptor_extension(slba, zone_idx); 3436 if (all || !ns->params.zd_extension_size) { 3437 return NVME_INVALID_FIELD | NVME_DNR; 3438 } 3439 zd_ext = nvme_get_zd_extension(ns, zone_idx); 3440 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req); 3441 if (status) { 3442 trace_pci_nvme_err_zd_extension_map_error(zone_idx); 3443 return status; 3444 } 3445 3446 status = nvme_set_zd_ext(ns, zone); 3447 if (status == NVME_SUCCESS) { 3448 trace_pci_nvme_zd_extension_set(zone_idx); 3449 return status; 3450 } 3451 break; 3452 3453 default: 3454 trace_pci_nvme_err_invalid_mgmt_action(action); 3455 status = NVME_INVALID_FIELD; 3456 } 3457 3458 if (status == NVME_ZONE_INVAL_TRANSITION) { 3459 trace_pci_nvme_err_invalid_zone_state_transition(action, slba, 3460 zone->d.za); 3461 } 3462 if (status) { 3463 status |= NVME_DNR; 3464 } 3465 3466 return status; 3467 } 3468 3469 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) 3470 { 3471 NvmeZoneState zs = nvme_get_zone_state(zl); 3472 3473 switch (zafs) { 3474 case NVME_ZONE_REPORT_ALL: 3475 return true; 3476 case NVME_ZONE_REPORT_EMPTY: 3477 return zs == NVME_ZONE_STATE_EMPTY; 3478 case NVME_ZONE_REPORT_IMPLICITLY_OPEN: 3479 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN; 3480 case NVME_ZONE_REPORT_EXPLICITLY_OPEN: 3481 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN; 3482 case NVME_ZONE_REPORT_CLOSED: 3483 return zs == NVME_ZONE_STATE_CLOSED; 3484 case NVME_ZONE_REPORT_FULL: 3485 return zs == NVME_ZONE_STATE_FULL; 3486 case NVME_ZONE_REPORT_READ_ONLY: 3487 return zs == NVME_ZONE_STATE_READ_ONLY; 3488 case NVME_ZONE_REPORT_OFFLINE: 3489 return zs == NVME_ZONE_STATE_OFFLINE; 3490 default: 3491 return false; 3492 } 3493 } 3494 3495 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) 3496 { 3497 NvmeCmd *cmd = (NvmeCmd *)&req->cmd; 3498 NvmeNamespace *ns = req->ns; 3499 /* cdw12 is zero-based number of dwords to return. 
Convert to bytes */ 3500 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2; 3501 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 3502 uint32_t zone_idx, zra, zrasf, partial; 3503 uint64_t max_zones, nr_zones = 0; 3504 uint16_t status; 3505 uint64_t slba; 3506 NvmeZoneDescr *z; 3507 NvmeZone *zone; 3508 NvmeZoneReportHeader *header; 3509 void *buf, *buf_p; 3510 size_t zone_entry_sz; 3511 int i; 3512 3513 req->status = NVME_SUCCESS; 3514 3515 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 3516 if (status) { 3517 return status; 3518 } 3519 3520 zra = dw13 & 0xff; 3521 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) { 3522 return NVME_INVALID_FIELD | NVME_DNR; 3523 } 3524 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) { 3525 return NVME_INVALID_FIELD | NVME_DNR; 3526 } 3527 3528 zrasf = (dw13 >> 8) & 0xff; 3529 if (zrasf > NVME_ZONE_REPORT_OFFLINE) { 3530 return NVME_INVALID_FIELD | NVME_DNR; 3531 } 3532 3533 if (data_size < sizeof(NvmeZoneReportHeader)) { 3534 return NVME_INVALID_FIELD | NVME_DNR; 3535 } 3536 3537 status = nvme_check_mdts(n, data_size); 3538 if (status) { 3539 return status; 3540 } 3541 3542 partial = (dw13 >> 16) & 0x01; 3543 3544 zone_entry_sz = sizeof(NvmeZoneDescr); 3545 if (zra == NVME_ZONE_REPORT_EXTENDED) { 3546 zone_entry_sz += ns->params.zd_extension_size; 3547 } 3548 3549 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; 3550 buf = g_malloc0(data_size); 3551 3552 zone = &ns->zone_array[zone_idx]; 3553 for (i = zone_idx; i < ns->num_zones; i++) { 3554 if (partial && nr_zones >= max_zones) { 3555 break; 3556 } 3557 if (nvme_zone_matches_filter(zrasf, zone++)) { 3558 nr_zones++; 3559 } 3560 } 3561 header = (NvmeZoneReportHeader *)buf; 3562 header->nr_zones = cpu_to_le64(nr_zones); 3563 3564 buf_p = buf + sizeof(NvmeZoneReportHeader); 3565 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) { 3566 zone = &ns->zone_array[zone_idx]; 3567 if (nvme_zone_matches_filter(zrasf, zone)) { 3568 z = (NvmeZoneDescr *)buf_p; 3569 buf_p += sizeof(NvmeZoneDescr); 3570 3571 z->zt = zone->d.zt; 3572 z->zs = zone->d.zs; 3573 z->zcap = cpu_to_le64(zone->d.zcap); 3574 z->zslba = cpu_to_le64(zone->d.zslba); 3575 z->za = zone->d.za; 3576 3577 if (nvme_wp_is_valid(zone)) { 3578 z->wp = cpu_to_le64(zone->d.wp); 3579 } else { 3580 z->wp = cpu_to_le64(~0ULL); 3581 } 3582 3583 if (zra == NVME_ZONE_REPORT_EXTENDED) { 3584 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) { 3585 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx), 3586 ns->params.zd_extension_size); 3587 } 3588 buf_p += ns->params.zd_extension_size; 3589 } 3590 3591 max_zones--; 3592 } 3593 } 3594 3595 status = nvme_c2h(n, (uint8_t *)buf, data_size, req); 3596 3597 g_free(buf); 3598 3599 return status; 3600 } 3601 3602 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) 3603 { 3604 NvmeNamespace *ns; 3605 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3606 3607 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), 3608 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); 3609 3610 if (!nvme_nsid_valid(n, nsid)) { 3611 return NVME_INVALID_NSID | NVME_DNR; 3612 } 3613 3614 /* 3615 * In the base NVM command set, Flush may apply to all namespaces 3616 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used 3617 * along with TP 4056 (Namespace Types), it may be pretty screwed up. 
3618 * 3619 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the 3620 * opcode with a specific command since we cannot determine a unique I/O 3621 * command set. Opcode 0h could have any other meaning than something 3622 * equivalent to flushing and say it DOES have completely different 3623 * semantics in some other command set - does an NSID of FFFFFFFFh then 3624 * mean "for all namespaces, apply whatever command set specific command 3625 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply 3626 * whatever command that uses the 0h opcode if, and only if, it allows NSID 3627 * to be FFFFFFFFh"? 3628 * 3629 * Anyway (and luckily), for now, we do not care about this since the 3630 * device only supports namespace types that includes the NVM Flush command 3631 * (NVM and Zoned), so always do an NVM Flush. 3632 */ 3633 if (req->cmd.opcode == NVME_CMD_FLUSH) { 3634 return nvme_flush(n, req); 3635 } 3636 3637 ns = nvme_ns(n, nsid); 3638 if (unlikely(!ns)) { 3639 return NVME_INVALID_FIELD | NVME_DNR; 3640 } 3641 3642 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 3643 trace_pci_nvme_err_invalid_opc(req->cmd.opcode); 3644 return NVME_INVALID_OPCODE | NVME_DNR; 3645 } 3646 3647 if (ns->status) { 3648 return ns->status; 3649 } 3650 3651 req->ns = ns; 3652 3653 switch (req->cmd.opcode) { 3654 case NVME_CMD_WRITE_ZEROES: 3655 return nvme_write_zeroes(n, req); 3656 case NVME_CMD_ZONE_APPEND: 3657 return nvme_zone_append(n, req); 3658 case NVME_CMD_WRITE: 3659 return nvme_write(n, req); 3660 case NVME_CMD_READ: 3661 return nvme_read(n, req); 3662 case NVME_CMD_COMPARE: 3663 return nvme_compare(n, req); 3664 case NVME_CMD_DSM: 3665 return nvme_dsm(n, req); 3666 case NVME_CMD_VERIFY: 3667 return nvme_verify(n, req); 3668 case NVME_CMD_COPY: 3669 return nvme_copy(n, req); 3670 case NVME_CMD_ZONE_MGMT_SEND: 3671 return nvme_zone_mgmt_send(n, req); 3672 case NVME_CMD_ZONE_MGMT_RECV: 3673 return nvme_zone_mgmt_recv(n, req); 3674 default: 3675 assert(false); 3676 } 3677 3678 return NVME_INVALID_OPCODE | NVME_DNR; 3679 } 3680 3681 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) 3682 { 3683 n->sq[sq->sqid] = NULL; 3684 timer_free(sq->timer); 3685 g_free(sq->io_req); 3686 if (sq->sqid) { 3687 g_free(sq); 3688 } 3689 } 3690 3691 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) 3692 { 3693 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 3694 NvmeRequest *r, *next; 3695 NvmeSQueue *sq; 3696 NvmeCQueue *cq; 3697 uint16_t qid = le16_to_cpu(c->qid); 3698 uint32_t nsid; 3699 3700 if (unlikely(!qid || nvme_check_sqid(n, qid))) { 3701 trace_pci_nvme_err_invalid_del_sq(qid); 3702 return NVME_INVALID_QID | NVME_DNR; 3703 } 3704 3705 trace_pci_nvme_del_sq(qid); 3706 3707 sq = n->sq[qid]; 3708 while (!QTAILQ_EMPTY(&sq->out_req_list)) { 3709 r = QTAILQ_FIRST(&sq->out_req_list); 3710 if (r->aiocb) { 3711 blk_aio_cancel(r->aiocb); 3712 } 3713 } 3714 3715 /* 3716 * Drain all namespaces if there are still outstanding requests that we 3717 * could not cancel explicitly. 
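 * This covers requests that fan out into multiple AIOs (e.g. DSM, Copy or a
 * broadcast Flush), for which req->aiocb does not cover all outstanding I/O.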
3718 */ 3719 if (!QTAILQ_EMPTY(&sq->out_req_list)) { 3720 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { 3721 NvmeNamespace *ns = nvme_ns(n, nsid); 3722 if (ns) { 3723 nvme_ns_drain(ns); 3724 } 3725 } 3726 } 3727 3728 assert(QTAILQ_EMPTY(&sq->out_req_list)); 3729 3730 if (!nvme_check_cqid(n, sq->cqid)) { 3731 cq = n->cq[sq->cqid]; 3732 QTAILQ_REMOVE(&cq->sq_list, sq, entry); 3733 3734 nvme_post_cqes(cq); 3735 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) { 3736 if (r->sq == sq) { 3737 QTAILQ_REMOVE(&cq->req_list, r, entry); 3738 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry); 3739 } 3740 } 3741 } 3742 3743 nvme_free_sq(sq, n); 3744 return NVME_SUCCESS; 3745 } 3746 3747 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, 3748 uint16_t sqid, uint16_t cqid, uint16_t size) 3749 { 3750 int i; 3751 NvmeCQueue *cq; 3752 3753 sq->ctrl = n; 3754 sq->dma_addr = dma_addr; 3755 sq->sqid = sqid; 3756 sq->size = size; 3757 sq->cqid = cqid; 3758 sq->head = sq->tail = 0; 3759 sq->io_req = g_new0(NvmeRequest, sq->size); 3760 3761 QTAILQ_INIT(&sq->req_list); 3762 QTAILQ_INIT(&sq->out_req_list); 3763 for (i = 0; i < sq->size; i++) { 3764 sq->io_req[i].sq = sq; 3765 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry); 3766 } 3767 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq); 3768 3769 assert(n->cq[cqid]); 3770 cq = n->cq[cqid]; 3771 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry); 3772 n->sq[sqid] = sq; 3773 } 3774 3775 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) 3776 { 3777 NvmeSQueue *sq; 3778 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd; 3779 3780 uint16_t cqid = le16_to_cpu(c->cqid); 3781 uint16_t sqid = le16_to_cpu(c->sqid); 3782 uint16_t qsize = le16_to_cpu(c->qsize); 3783 uint16_t qflags = le16_to_cpu(c->sq_flags); 3784 uint64_t prp1 = le64_to_cpu(c->prp1); 3785 3786 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags); 3787 3788 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) { 3789 trace_pci_nvme_err_invalid_create_sq_cqid(cqid); 3790 return NVME_INVALID_CQID | NVME_DNR; 3791 } 3792 if (unlikely(!sqid || sqid > n->params.max_ioqpairs || 3793 n->sq[sqid] != NULL)) { 3794 trace_pci_nvme_err_invalid_create_sq_sqid(sqid); 3795 return NVME_INVALID_QID | NVME_DNR; 3796 } 3797 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { 3798 trace_pci_nvme_err_invalid_create_sq_size(qsize); 3799 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 3800 } 3801 if (unlikely(prp1 & (n->page_size - 1))) { 3802 trace_pci_nvme_err_invalid_create_sq_addr(prp1); 3803 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 3804 } 3805 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) { 3806 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags)); 3807 return NVME_INVALID_FIELD | NVME_DNR; 3808 } 3809 sq = g_malloc0(sizeof(*sq)); 3810 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1); 3811 return NVME_SUCCESS; 3812 } 3813 3814 struct nvme_stats { 3815 uint64_t units_read; 3816 uint64_t units_written; 3817 uint64_t read_commands; 3818 uint64_t write_commands; 3819 }; 3820 3821 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) 3822 { 3823 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); 3824 3825 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; 3826 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; 3827 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; 3828 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; 3829 } 3830 3831 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len, 3832 uint64_t off, NvmeRequest *req) 3833 { 3834 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3835 struct nvme_stats stats = { 0 }; 3836 NvmeSmartLog smart = { 0 }; 3837 uint32_t trans_len; 3838 NvmeNamespace *ns; 3839 time_t current_ms; 3840 3841 if (off >= sizeof(smart)) { 3842 return NVME_INVALID_FIELD | NVME_DNR; 3843 } 3844 3845 if (nsid != 0xffffffff) { 3846 ns = nvme_ns(n, nsid); 3847 if (!ns) { 3848 return NVME_INVALID_NSID | NVME_DNR; 3849 } 3850 nvme_set_blk_stats(ns, &stats); 3851 } else { 3852 int i; 3853 3854 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 3855 ns = nvme_ns(n, i); 3856 if (!ns) { 3857 continue; 3858 } 3859 nvme_set_blk_stats(ns, &stats); 3860 } 3861 } 3862 3863 trans_len = MIN(sizeof(smart) - off, buf_len); 3864 smart.critical_warning = n->smart_critical_warning; 3865 3866 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, 3867 1000)); 3868 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written, 3869 1000)); 3870 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); 3871 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); 3872 3873 smart.temperature = cpu_to_le16(n->temperature); 3874 3875 if ((n->temperature >= n->features.temp_thresh_hi) || 3876 (n->temperature <= n->features.temp_thresh_low)) { 3877 smart.critical_warning |= NVME_SMART_TEMPERATURE; 3878 } 3879 3880 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 3881 smart.power_on_hours[0] = 3882 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60); 3883 3884 if (!rae) { 3885 nvme_clear_events(n, NVME_AER_TYPE_SMART); 3886 } 3887 3888 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); 3889 } 3890 3891 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, 3892 NvmeRequest *req) 3893 { 3894 uint32_t trans_len; 3895 NvmeFwSlotInfoLog fw_log = { 3896 .afi = 0x1, 3897 }; 3898 3899 if (off >= sizeof(fw_log)) { 3900 return NVME_INVALID_FIELD | NVME_DNR; 3901 } 3902 3903 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); 3904 trans_len = MIN(sizeof(fw_log) - off, buf_len); 3905 3906 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req); 3907 } 3908 3909 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 3910 uint64_t off, NvmeRequest *req) 3911 { 3912 uint32_t trans_len; 3913 NvmeErrorLog errlog; 3914 3915 if (off >= sizeof(errlog)) { 3916 return NVME_INVALID_FIELD | NVME_DNR; 3917 } 3918 3919 if (!rae) { 3920 nvme_clear_events(n, NVME_AER_TYPE_ERROR); 3921 } 3922 3923 memset(&errlog, 0x0, sizeof(errlog)); 3924 trans_len = MIN(sizeof(errlog) - off, buf_len); 3925 3926 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); 3927 } 3928 3929 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 3930 uint64_t off, NvmeRequest *req) 3931 { 3932 uint32_t nslist[1024]; 3933 uint32_t trans_len; 3934 int i = 0; 3935 uint32_t nsid; 3936 3937 memset(nslist, 0x0, sizeof(nslist)); 3938 trans_len = MIN(sizeof(nslist) - off, buf_len); 3939 3940 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != 3941 NVME_CHANGED_NSID_SIZE) { 3942 /* 3943 * If more than 1024 namespaces, the first entry in the log page should 3944 * be set to FFFFFFFFh and the others to 0 as spec. 
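 * (the Changed Namespace List log page is 4096 bytes, i.e. room for 1024
 * four-byte NSID entries)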
3945 */ 3946 if (i == ARRAY_SIZE(nslist)) { 3947 memset(nslist, 0x0, sizeof(nslist)); 3948 nslist[0] = 0xffffffff; 3949 break; 3950 } 3951 3952 nslist[i++] = nsid; 3953 clear_bit(nsid, n->changed_nsids); 3954 } 3955 3956 /* 3957 * Remove all the remaining list entries in case returns directly due to 3958 * more than 1024 namespaces. 3959 */ 3960 if (nslist[0] == 0xffffffff) { 3961 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE); 3962 } 3963 3964 if (!rae) { 3965 nvme_clear_events(n, NVME_AER_TYPE_NOTICE); 3966 } 3967 3968 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req); 3969 } 3970 3971 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, 3972 uint64_t off, NvmeRequest *req) 3973 { 3974 NvmeEffectsLog log = {}; 3975 const uint32_t *src_iocs = NULL; 3976 uint32_t trans_len; 3977 3978 if (off >= sizeof(log)) { 3979 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log)); 3980 return NVME_INVALID_FIELD | NVME_DNR; 3981 } 3982 3983 switch (NVME_CC_CSS(n->bar.cc)) { 3984 case NVME_CC_CSS_NVM: 3985 src_iocs = nvme_cse_iocs_nvm; 3986 /* fall through */ 3987 case NVME_CC_CSS_ADMIN_ONLY: 3988 break; 3989 case NVME_CC_CSS_CSI: 3990 switch (csi) { 3991 case NVME_CSI_NVM: 3992 src_iocs = nvme_cse_iocs_nvm; 3993 break; 3994 case NVME_CSI_ZONED: 3995 src_iocs = nvme_cse_iocs_zoned; 3996 break; 3997 } 3998 } 3999 4000 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); 4001 4002 if (src_iocs) { 4003 memcpy(log.iocs, src_iocs, sizeof(log.iocs)); 4004 } 4005 4006 trans_len = MIN(sizeof(log) - off, buf_len); 4007 4008 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); 4009 } 4010 4011 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) 4012 { 4013 NvmeCmd *cmd = &req->cmd; 4014 4015 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4016 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4017 uint32_t dw12 = le32_to_cpu(cmd->cdw12); 4018 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 4019 uint8_t lid = dw10 & 0xff; 4020 uint8_t lsp = (dw10 >> 8) & 0xf; 4021 uint8_t rae = (dw10 >> 15) & 0x1; 4022 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24; 4023 uint32_t numdl, numdu; 4024 uint64_t off, lpol, lpou; 4025 size_t len; 4026 uint16_t status; 4027 4028 numdl = (dw10 >> 16); 4029 numdu = (dw11 & 0xffff); 4030 lpol = dw12; 4031 lpou = dw13; 4032 4033 len = (((numdu << 16) | numdl) + 1) << 2; 4034 off = (lpou << 32ULL) | lpol; 4035 4036 if (off & 0x3) { 4037 return NVME_INVALID_FIELD | NVME_DNR; 4038 } 4039 4040 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off); 4041 4042 status = nvme_check_mdts(n, len); 4043 if (status) { 4044 return status; 4045 } 4046 4047 switch (lid) { 4048 case NVME_LOG_ERROR_INFO: 4049 return nvme_error_info(n, rae, len, off, req); 4050 case NVME_LOG_SMART_INFO: 4051 return nvme_smart_info(n, rae, len, off, req); 4052 case NVME_LOG_FW_SLOT_INFO: 4053 return nvme_fw_log_info(n, len, off, req); 4054 case NVME_LOG_CHANGED_NSLIST: 4055 return nvme_changed_nslist(n, rae, len, off, req); 4056 case NVME_LOG_CMD_EFFECTS: 4057 return nvme_cmd_effects(n, csi, len, off, req); 4058 default: 4059 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); 4060 return NVME_INVALID_FIELD | NVME_DNR; 4061 } 4062 } 4063 4064 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) 4065 { 4066 n->cq[cq->cqid] = NULL; 4067 timer_free(cq->timer); 4068 if (msix_enabled(&n->parent_obj)) { 4069 msix_vector_unuse(&n->parent_obj, cq->vector); 4070 } 4071 if (cq->cqid) { 4072 g_free(cq); 4073 } 4074 } 4075 4076 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) 
4077 { 4078 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 4079 NvmeCQueue *cq; 4080 uint16_t qid = le16_to_cpu(c->qid); 4081 4082 if (unlikely(!qid || nvme_check_cqid(n, qid))) { 4083 trace_pci_nvme_err_invalid_del_cq_cqid(qid); 4084 return NVME_INVALID_CQID | NVME_DNR; 4085 } 4086 4087 cq = n->cq[qid]; 4088 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) { 4089 trace_pci_nvme_err_invalid_del_cq_notempty(qid); 4090 return NVME_INVALID_QUEUE_DEL; 4091 } 4092 nvme_irq_deassert(n, cq); 4093 trace_pci_nvme_del_cq(qid); 4094 nvme_free_cq(cq, n); 4095 return NVME_SUCCESS; 4096 } 4097 4098 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, 4099 uint16_t cqid, uint16_t vector, uint16_t size, 4100 uint16_t irq_enabled) 4101 { 4102 int ret; 4103 4104 if (msix_enabled(&n->parent_obj)) { 4105 ret = msix_vector_use(&n->parent_obj, vector); 4106 assert(ret == 0); 4107 } 4108 cq->ctrl = n; 4109 cq->cqid = cqid; 4110 cq->size = size; 4111 cq->dma_addr = dma_addr; 4112 cq->phase = 1; 4113 cq->irq_enabled = irq_enabled; 4114 cq->vector = vector; 4115 cq->head = cq->tail = 0; 4116 QTAILQ_INIT(&cq->req_list); 4117 QTAILQ_INIT(&cq->sq_list); 4118 n->cq[cqid] = cq; 4119 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); 4120 } 4121 4122 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) 4123 { 4124 NvmeCQueue *cq; 4125 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd; 4126 uint16_t cqid = le16_to_cpu(c->cqid); 4127 uint16_t vector = le16_to_cpu(c->irq_vector); 4128 uint16_t qsize = le16_to_cpu(c->qsize); 4129 uint16_t qflags = le16_to_cpu(c->cq_flags); 4130 uint64_t prp1 = le64_to_cpu(c->prp1); 4131 4132 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, 4133 NVME_CQ_FLAGS_IEN(qflags) != 0); 4134 4135 if (unlikely(!cqid || cqid > n->params.max_ioqpairs || 4136 n->cq[cqid] != NULL)) { 4137 trace_pci_nvme_err_invalid_create_cq_cqid(cqid); 4138 return NVME_INVALID_QID | NVME_DNR; 4139 } 4140 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { 4141 trace_pci_nvme_err_invalid_create_cq_size(qsize); 4142 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 4143 } 4144 if (unlikely(prp1 & (n->page_size - 1))) { 4145 trace_pci_nvme_err_invalid_create_cq_addr(prp1); 4146 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 4147 } 4148 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) { 4149 trace_pci_nvme_err_invalid_create_cq_vector(vector); 4150 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 4151 } 4152 if (unlikely(vector >= n->params.msix_qsize)) { 4153 trace_pci_nvme_err_invalid_create_cq_vector(vector); 4154 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 4155 } 4156 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) { 4157 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags)); 4158 return NVME_INVALID_FIELD | NVME_DNR; 4159 } 4160 4161 cq = g_malloc0(sizeof(*cq)); 4162 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1, 4163 NVME_CQ_FLAGS_IEN(qflags)); 4164 4165 /* 4166 * It is only required to set qs_created when creating a completion queue; 4167 * creating a submission queue without a matching completion queue will 4168 * fail. 
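 * (nvme_create_sq() rejects a CQID that does not refer to an already created
 * completion queue with Invalid Completion Queue Identifier)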
4169 */ 4170 n->qs_created = true; 4171 return NVME_SUCCESS; 4172 } 4173 4174 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) 4175 { 4176 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 4177 4178 return nvme_c2h(n, id, sizeof(id), req); 4179 } 4180 4181 static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns) 4182 { 4183 switch (ns->csi) { 4184 case NVME_CSI_NVM: 4185 case NVME_CSI_ZONED: 4186 return true; 4187 } 4188 return false; 4189 } 4190 4191 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) 4192 { 4193 trace_pci_nvme_identify_ctrl(); 4194 4195 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req); 4196 } 4197 4198 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) 4199 { 4200 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4201 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 4202 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id; 4203 4204 trace_pci_nvme_identify_ctrl_csi(c->csi); 4205 4206 switch (c->csi) { 4207 case NVME_CSI_NVM: 4208 id_nvm->vsl = n->params.vsl; 4209 id_nvm->dmrsl = cpu_to_le32(n->dmrsl); 4210 break; 4211 4212 case NVME_CSI_ZONED: 4213 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl; 4214 break; 4215 4216 default: 4217 return NVME_INVALID_FIELD | NVME_DNR; 4218 } 4219 4220 return nvme_c2h(n, id, sizeof(id), req); 4221 } 4222 4223 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) 4224 { 4225 NvmeNamespace *ns; 4226 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4227 uint32_t nsid = le32_to_cpu(c->nsid); 4228 4229 trace_pci_nvme_identify_ns(nsid); 4230 4231 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4232 return NVME_INVALID_NSID | NVME_DNR; 4233 } 4234 4235 ns = nvme_ns(n, nsid); 4236 if (unlikely(!ns)) { 4237 if (!active) { 4238 ns = nvme_subsys_ns(n->subsys, nsid); 4239 if (!ns) { 4240 return nvme_rpt_empty_id_struct(n, req); 4241 } 4242 } else { 4243 return nvme_rpt_empty_id_struct(n, req); 4244 } 4245 } 4246 4247 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { 4248 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); 4249 } 4250 4251 return NVME_INVALID_CMD_SET | NVME_DNR; 4252 } 4253 4254 static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) 4255 { 4256 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4257 uint16_t min_id = le16_to_cpu(c->ctrlid); 4258 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 4259 uint16_t *ids = &list[1]; 4260 NvmeNamespace *ns; 4261 NvmeCtrl *ctrl; 4262 int cntlid, nr_ids = 0; 4263 4264 trace_pci_nvme_identify_ns_attached_list(min_id); 4265 4266 if (c->nsid == NVME_NSID_BROADCAST) { 4267 return NVME_INVALID_FIELD | NVME_DNR; 4268 } 4269 4270 ns = nvme_subsys_ns(n->subsys, c->nsid); 4271 if (!ns) { 4272 return NVME_INVALID_FIELD | NVME_DNR; 4273 } 4274 4275 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { 4276 ctrl = nvme_subsys_ctrl(n->subsys, cntlid); 4277 if (!ctrl) { 4278 continue; 4279 } 4280 4281 if (!nvme_ns(ctrl, c->nsid)) { 4282 continue; 4283 } 4284 4285 ids[nr_ids++] = cntlid; 4286 } 4287 4288 list[0] = nr_ids; 4289 4290 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req); 4291 } 4292 4293 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, 4294 bool active) 4295 { 4296 NvmeNamespace *ns; 4297 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4298 uint32_t nsid = le32_to_cpu(c->nsid); 4299 4300 trace_pci_nvme_identify_ns_csi(nsid, c->csi); 4301 4302 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4303 return 
NVME_INVALID_NSID | NVME_DNR; 4304 } 4305 4306 ns = nvme_ns(n, nsid); 4307 if (unlikely(!ns)) { 4308 if (!active) { 4309 ns = nvme_subsys_ns(n->subsys, nsid); 4310 if (!ns) { 4311 return nvme_rpt_empty_id_struct(n, req); 4312 } 4313 } else { 4314 return nvme_rpt_empty_id_struct(n, req); 4315 } 4316 } 4317 4318 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { 4319 return nvme_rpt_empty_id_struct(n, req); 4320 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { 4321 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), 4322 req); 4323 } 4324 4325 return NVME_INVALID_FIELD | NVME_DNR; 4326 } 4327 4328 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, 4329 bool active) 4330 { 4331 NvmeNamespace *ns; 4332 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4333 uint32_t min_nsid = le32_to_cpu(c->nsid); 4334 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4335 static const int data_len = sizeof(list); 4336 uint32_t *list_ptr = (uint32_t *)list; 4337 int i, j = 0; 4338 4339 trace_pci_nvme_identify_nslist(min_nsid); 4340 4341 /* 4342 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFFEh are invalid values 4343 * since the Active Namespace ID List should return namespaces with ids 4344 * *higher* than the NSID specified in the command. This is also specified 4345 * in the spec (NVM Express v1.3d, Section 5.15.4). 4346 */ 4347 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 4348 return NVME_INVALID_NSID | NVME_DNR; 4349 } 4350 4351 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4352 ns = nvme_ns(n, i); 4353 if (!ns) { 4354 if (!active) { 4355 ns = nvme_subsys_ns(n->subsys, i); 4356 if (!ns) { 4357 continue; 4358 } 4359 } else { 4360 continue; 4361 } 4362 } 4363 if (ns->params.nsid <= min_nsid) { 4364 continue; 4365 } 4366 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 4367 if (j == data_len / sizeof(uint32_t)) { 4368 break; 4369 } 4370 } 4371 4372 return nvme_c2h(n, list, data_len, req); 4373 } 4374 4375 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, 4376 bool active) 4377 { 4378 NvmeNamespace *ns; 4379 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4380 uint32_t min_nsid = le32_to_cpu(c->nsid); 4381 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4382 static const int data_len = sizeof(list); 4383 uint32_t *list_ptr = (uint32_t *)list; 4384 int i, j = 0; 4385 4386 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); 4387 4388 /* 4389 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFFEh are invalid. 
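 * For example, a starting NSID of FFFFFFFEh would leave only the broadcast
 * value as a "higher" identifier, so there would be nothing valid left to
 * report.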
4390 */ 4391 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 4392 return NVME_INVALID_NSID | NVME_DNR; 4393 } 4394 4395 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) { 4396 return NVME_INVALID_FIELD | NVME_DNR; 4397 } 4398 4399 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4400 ns = nvme_ns(n, i); 4401 if (!ns) { 4402 if (!active) { 4403 ns = nvme_subsys_ns(n->subsys, i); 4404 if (!ns) { 4405 continue; 4406 } 4407 } else { 4408 continue; 4409 } 4410 } 4411 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { 4412 continue; 4413 } 4414 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 4415 if (j == data_len / sizeof(uint32_t)) { 4416 break; 4417 } 4418 } 4419 4420 return nvme_c2h(n, list, data_len, req); 4421 } 4422 4423 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) 4424 { 4425 NvmeNamespace *ns; 4426 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4427 uint32_t nsid = le32_to_cpu(c->nsid); 4428 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4429 4430 struct data { 4431 struct { 4432 NvmeIdNsDescr hdr; 4433 uint8_t v[NVME_NIDL_UUID]; 4434 } uuid; 4435 struct { 4436 NvmeIdNsDescr hdr; 4437 uint8_t v; 4438 } csi; 4439 }; 4440 4441 struct data *ns_descrs = (struct data *)list; 4442 4443 trace_pci_nvme_identify_ns_descr_list(nsid); 4444 4445 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4446 return NVME_INVALID_NSID | NVME_DNR; 4447 } 4448 4449 ns = nvme_ns(n, nsid); 4450 if (unlikely(!ns)) { 4451 return NVME_INVALID_FIELD | NVME_DNR; 4452 } 4453 4454 /* 4455 * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data 4456 * structure, a Namespace UUID (nidt = 3h) must be reported in the 4457 * Namespace Identification Descriptor. Add the namespace UUID here. 4458 */ 4459 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; 4460 ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID; 4461 memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); 4462 4463 ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI; 4464 ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; 4465 ns_descrs->csi.v = ns->csi; 4466 4467 return nvme_c2h(n, list, sizeof(list), req); 4468 } 4469 4470 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) 4471 { 4472 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4473 static const int data_len = sizeof(list); 4474 4475 trace_pci_nvme_identify_cmd_set(); 4476 4477 NVME_SET_CSI(*list, NVME_CSI_NVM); 4478 NVME_SET_CSI(*list, NVME_CSI_ZONED); 4479 4480 return nvme_c2h(n, list, data_len, req); 4481 } 4482 4483 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) 4484 { 4485 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4486 4487 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid), 4488 c->csi); 4489 4490 switch (c->cns) { 4491 case NVME_ID_CNS_NS: 4492 return nvme_identify_ns(n, req, true); 4493 case NVME_ID_CNS_NS_PRESENT: 4494 return nvme_identify_ns(n, req, false); 4495 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: 4496 return nvme_identify_ns_attached_list(n, req); 4497 case NVME_ID_CNS_CS_NS: 4498 return nvme_identify_ns_csi(n, req, true); 4499 case NVME_ID_CNS_CS_NS_PRESENT: 4500 return nvme_identify_ns_csi(n, req, false); 4501 case NVME_ID_CNS_CTRL: 4502 return nvme_identify_ctrl(n, req); 4503 case NVME_ID_CNS_CS_CTRL: 4504 return nvme_identify_ctrl_csi(n, req); 4505 case NVME_ID_CNS_NS_ACTIVE_LIST: 4506 return nvme_identify_nslist(n, req, true); 4507 case NVME_ID_CNS_NS_PRESENT_LIST: 4508 return nvme_identify_nslist(n, req, false); 4509 case NVME_ID_CNS_CS_NS_ACTIVE_LIST: 4510 return nvme_identify_nslist_csi(n, req, true); 4511 
case NVME_ID_CNS_CS_NS_PRESENT_LIST: 4512 return nvme_identify_nslist_csi(n, req, false); 4513 case NVME_ID_CNS_NS_DESCR_LIST: 4514 return nvme_identify_ns_descr_list(n, req); 4515 case NVME_ID_CNS_IO_COMMAND_SET: 4516 return nvme_identify_cmd_set(n, req); 4517 default: 4518 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); 4519 return NVME_INVALID_FIELD | NVME_DNR; 4520 } 4521 } 4522 4523 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req) 4524 { 4525 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff; 4526 4527 req->cqe.result = 1; 4528 if (nvme_check_sqid(n, sqid)) { 4529 return NVME_INVALID_FIELD | NVME_DNR; 4530 } 4531 4532 return NVME_SUCCESS; 4533 } 4534 4535 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts) 4536 { 4537 trace_pci_nvme_setfeat_timestamp(ts); 4538 4539 n->host_timestamp = le64_to_cpu(ts); 4540 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4541 } 4542 4543 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) 4544 { 4545 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4546 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms; 4547 4548 union nvme_timestamp { 4549 struct { 4550 uint64_t timestamp:48; 4551 uint64_t sync:1; 4552 uint64_t origin:3; 4553 uint64_t rsvd1:12; 4554 }; 4555 uint64_t all; 4556 }; 4557 4558 union nvme_timestamp ts; 4559 ts.all = 0; 4560 ts.timestamp = n->host_timestamp + elapsed_time; 4561 4562 /* If the host timestamp is non-zero, set the timestamp origin */ 4563 ts.origin = n->host_timestamp ? 0x01 : 0x00; 4564 4565 trace_pci_nvme_getfeat_timestamp(ts.all); 4566 4567 return cpu_to_le64(ts.all); 4568 } 4569 4570 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) 4571 { 4572 uint64_t timestamp = nvme_get_timestamp(n); 4573 4574 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req); 4575 } 4576 4577 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) 4578 { 4579 NvmeCmd *cmd = &req->cmd; 4580 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4581 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4582 uint32_t nsid = le32_to_cpu(cmd->nsid); 4583 uint32_t result; 4584 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 4585 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); 4586 uint16_t iv; 4587 NvmeNamespace *ns; 4588 int i; 4589 4590 static const uint32_t nvme_feature_default[NVME_FID_MAX] = { 4591 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, 4592 }; 4593 4594 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11); 4595 4596 if (!nvme_feature_support[fid]) { 4597 return NVME_INVALID_FIELD | NVME_DNR; 4598 } 4599 4600 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 4601 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4602 /* 4603 * The Reservation Notification Mask and Reservation Persistence 4604 * features require a status code of Invalid Field in Command when 4605 * NSID is FFFFFFFFh. Since the device does not support those 4606 * features we can always return Invalid Namespace or Format as we 4607 * should do for all other features.
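         * For namespace-specific features the broadcast NSID is only honoured
         * by Set Features, which applies the value (e.g. Error Recovery or
         * Volatile Write Cache) to every attached namespace; see
         * nvme_set_feature.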
4608 */ 4609 return NVME_INVALID_NSID | NVME_DNR; 4610 } 4611 4612 if (!nvme_ns(n, nsid)) { 4613 return NVME_INVALID_FIELD | NVME_DNR; 4614 } 4615 } 4616 4617 switch (sel) { 4618 case NVME_GETFEAT_SELECT_CURRENT: 4619 break; 4620 case NVME_GETFEAT_SELECT_SAVED: 4621 /* no features are saveable by the controller; fallthrough */ 4622 case NVME_GETFEAT_SELECT_DEFAULT: 4623 goto defaults; 4624 case NVME_GETFEAT_SELECT_CAP: 4625 result = nvme_feature_cap[fid]; 4626 goto out; 4627 } 4628 4629 switch (fid) { 4630 case NVME_TEMPERATURE_THRESHOLD: 4631 result = 0; 4632 4633 /* 4634 * The controller only implements the Composite Temperature sensor, so 4635 * return 0 for all other sensors. 4636 */ 4637 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4638 goto out; 4639 } 4640 4641 switch (NVME_TEMP_THSEL(dw11)) { 4642 case NVME_TEMP_THSEL_OVER: 4643 result = n->features.temp_thresh_hi; 4644 goto out; 4645 case NVME_TEMP_THSEL_UNDER: 4646 result = n->features.temp_thresh_low; 4647 goto out; 4648 } 4649 4650 return NVME_INVALID_FIELD | NVME_DNR; 4651 case NVME_ERROR_RECOVERY: 4652 if (!nvme_nsid_valid(n, nsid)) { 4653 return NVME_INVALID_NSID | NVME_DNR; 4654 } 4655 4656 ns = nvme_ns(n, nsid); 4657 if (unlikely(!ns)) { 4658 return NVME_INVALID_FIELD | NVME_DNR; 4659 } 4660 4661 result = ns->features.err_rec; 4662 goto out; 4663 case NVME_VOLATILE_WRITE_CACHE: 4664 result = 0; 4665 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4666 ns = nvme_ns(n, i); 4667 if (!ns) { 4668 continue; 4669 } 4670 4671 result = blk_enable_write_cache(ns->blkconf.blk); 4672 if (result) { 4673 break; 4674 } 4675 } 4676 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); 4677 goto out; 4678 case NVME_ASYNCHRONOUS_EVENT_CONF: 4679 result = n->features.async_config; 4680 goto out; 4681 case NVME_TIMESTAMP: 4682 return nvme_get_feature_timestamp(n, req); 4683 default: 4684 break; 4685 } 4686 4687 defaults: 4688 switch (fid) { 4689 case NVME_TEMPERATURE_THRESHOLD: 4690 result = 0; 4691 4692 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4693 break; 4694 } 4695 4696 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) { 4697 result = NVME_TEMPERATURE_WARNING; 4698 } 4699 4700 break; 4701 case NVME_NUMBER_OF_QUEUES: 4702 result = (n->params.max_ioqpairs - 1) | 4703 ((n->params.max_ioqpairs - 1) << 16); 4704 trace_pci_nvme_getfeat_numq(result); 4705 break; 4706 case NVME_INTERRUPT_VECTOR_CONF: 4707 iv = dw11 & 0xffff; 4708 if (iv >= n->params.max_ioqpairs + 1) { 4709 return NVME_INVALID_FIELD | NVME_DNR; 4710 } 4711 4712 result = iv; 4713 if (iv == n->admin_cq.vector) { 4714 result |= NVME_INTVC_NOCOALESCING; 4715 } 4716 break; 4717 default: 4718 result = nvme_feature_default[fid]; 4719 break; 4720 } 4721 4722 out: 4723 req->cqe.result = cpu_to_le32(result); 4724 return NVME_SUCCESS; 4725 } 4726 4727 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) 4728 { 4729 uint16_t ret; 4730 uint64_t timestamp; 4731 4732 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req); 4733 if (ret) { 4734 return ret; 4735 } 4736 4737 nvme_set_timestamp(n, timestamp); 4738 4739 return NVME_SUCCESS; 4740 } 4741 4742 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) 4743 { 4744 NvmeNamespace *ns = NULL; 4745 4746 NvmeCmd *cmd = &req->cmd; 4747 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4748 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4749 uint32_t nsid = le32_to_cpu(cmd->nsid); 4750 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 4751 uint8_t save = NVME_SETFEAT_SAVE(dw10); 4752 int i; 4753
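    /*
     * CDW10/CDW11 layout used below: the Feature Identifier is in CDW10 bits
     * 07:00 and the Save bit in CDW10 bit 31 (decoded by NVME_GETSETFEAT_FID
     * and NVME_SETFEAT_SAVE); CDW11 carries the feature-specific value.
     */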
4754 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); 4755 4756 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) { 4757 return NVME_FID_NOT_SAVEABLE | NVME_DNR; 4758 } 4759 4760 if (!nvme_feature_support[fid]) { 4761 return NVME_INVALID_FIELD | NVME_DNR; 4762 } 4763 4764 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 4765 if (nsid != NVME_NSID_BROADCAST) { 4766 if (!nvme_nsid_valid(n, nsid)) { 4767 return NVME_INVALID_NSID | NVME_DNR; 4768 } 4769 4770 ns = nvme_ns(n, nsid); 4771 if (unlikely(!ns)) { 4772 return NVME_INVALID_FIELD | NVME_DNR; 4773 } 4774 } 4775 } else if (nsid && nsid != NVME_NSID_BROADCAST) { 4776 if (!nvme_nsid_valid(n, nsid)) { 4777 return NVME_INVALID_NSID | NVME_DNR; 4778 } 4779 4780 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR; 4781 } 4782 4783 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) { 4784 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 4785 } 4786 4787 switch (fid) { 4788 case NVME_TEMPERATURE_THRESHOLD: 4789 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4790 break; 4791 } 4792 4793 switch (NVME_TEMP_THSEL(dw11)) { 4794 case NVME_TEMP_THSEL_OVER: 4795 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11); 4796 break; 4797 case NVME_TEMP_THSEL_UNDER: 4798 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11); 4799 break; 4800 default: 4801 return NVME_INVALID_FIELD | NVME_DNR; 4802 } 4803 4804 if ((n->temperature >= n->features.temp_thresh_hi) || 4805 (n->temperature <= n->features.temp_thresh_low)) { 4806 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH); 4807 } 4808 4809 break; 4810 case NVME_ERROR_RECOVERY: 4811 if (nsid == NVME_NSID_BROADCAST) { 4812 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4813 ns = nvme_ns(n, i); 4814 4815 if (!ns) { 4816 continue; 4817 } 4818 4819 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 4820 ns->features.err_rec = dw11; 4821 } 4822 } 4823 4824 break; 4825 } 4826 4827 assert(ns); 4828 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 4829 ns->features.err_rec = dw11; 4830 } 4831 break; 4832 case NVME_VOLATILE_WRITE_CACHE: 4833 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4834 ns = nvme_ns(n, i); 4835 if (!ns) { 4836 continue; 4837 } 4838 4839 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) { 4840 blk_flush(ns->blkconf.blk); 4841 } 4842 4843 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1); 4844 } 4845 4846 break; 4847 4848 case NVME_NUMBER_OF_QUEUES: 4849 if (n->qs_created) { 4850 return NVME_CMD_SEQ_ERROR | NVME_DNR; 4851 } 4852 4853 /* 4854 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR 4855 * and NSQR. 
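         * Both NCQR and NSQR are 0's based, so e.g. CDW11 = 00030003h requests
         * four I/O completion and four I/O submission queues. Whatever is
         * requested, the completion entry below reports (max_ioqpairs - 1) in
         * both halves, i.e. the fixed number of queue pairs this controller
         * actually provides.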
4856 */ 4857 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) { 4858 return NVME_INVALID_FIELD | NVME_DNR; 4859 } 4860 4861 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1, 4862 ((dw11 >> 16) & 0xffff) + 1, 4863 n->params.max_ioqpairs, 4864 n->params.max_ioqpairs); 4865 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) | 4866 ((n->params.max_ioqpairs - 1) << 16)); 4867 break; 4868 case NVME_ASYNCHRONOUS_EVENT_CONF: 4869 n->features.async_config = dw11; 4870 break; 4871 case NVME_TIMESTAMP: 4872 return nvme_set_feature_timestamp(n, req); 4873 case NVME_COMMAND_SET_PROFILE: 4874 if (dw11 & 0x1ff) { 4875 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); 4876 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR; 4877 } 4878 break; 4879 default: 4880 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 4881 } 4882 return NVME_SUCCESS; 4883 } 4884 4885 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) 4886 { 4887 trace_pci_nvme_aer(nvme_cid(req)); 4888 4889 if (n->outstanding_aers > n->params.aerl) { 4890 trace_pci_nvme_aer_aerl_exceeded(); 4891 return NVME_AER_LIMIT_EXCEEDED; 4892 } 4893 4894 n->aer_reqs[n->outstanding_aers] = req; 4895 n->outstanding_aers++; 4896 4897 if (!QTAILQ_EMPTY(&n->aer_queue)) { 4898 nvme_process_aers(n); 4899 } 4900 4901 return NVME_NO_COMPLETE; 4902 } 4903 4904 static void nvme_update_dmrsl(NvmeCtrl *n) 4905 { 4906 int nsid; 4907 4908 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { 4909 NvmeNamespace *ns = nvme_ns(n, nsid); 4910 if (!ns) { 4911 continue; 4912 } 4913 4914 n->dmrsl = MIN_NON_ZERO(n->dmrsl, 4915 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 4916 } 4917 } 4918 4919 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns) 4920 { 4921 ns->iocs = nvme_cse_iocs_none; 4922 switch (ns->csi) { 4923 case NVME_CSI_NVM: 4924 if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { 4925 ns->iocs = nvme_cse_iocs_nvm; 4926 } 4927 break; 4928 case NVME_CSI_ZONED: 4929 if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { 4930 ns->iocs = nvme_cse_iocs_zoned; 4931 } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { 4932 ns->iocs = nvme_cse_iocs_nvm; 4933 } 4934 break; 4935 } 4936 } 4937 4938 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) 4939 { 4940 NvmeNamespace *ns; 4941 NvmeCtrl *ctrl; 4942 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 4943 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 4944 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 4945 bool attach = !(dw10 & 0xf); 4946 uint16_t *nr_ids = &list[0]; 4947 uint16_t *ids = &list[1]; 4948 uint16_t ret; 4949 int i; 4950 4951 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf); 4952 4953 if (!nvme_nsid_valid(n, nsid)) { 4954 return NVME_INVALID_NSID | NVME_DNR; 4955 } 4956 4957 ns = nvme_subsys_ns(n->subsys, nsid); 4958 if (!ns) { 4959 return NVME_INVALID_FIELD | NVME_DNR; 4960 } 4961 4962 ret = nvme_h2c(n, (uint8_t *)list, 4096, req); 4963 if (ret) { 4964 return ret; 4965 } 4966 4967 if (!*nr_ids) { 4968 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 4969 } 4970 4971 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1); 4972 for (i = 0; i < *nr_ids; i++) { 4973 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]); 4974 if (!ctrl) { 4975 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 4976 } 4977 4978 if (attach) { 4979 if (nvme_ns(ctrl, nsid)) { 4980 return NVME_NS_ALREADY_ATTACHED | NVME_DNR; 4981 } 4982 4983 if (ns->attached && !ns->params.shared) { 4984 return NVME_NS_PRIVATE | NVME_DNR; 4985 } 4986 4987 nvme_attach_ns(ctrl, ns); 4988 nvme_select_iocs_ns(ctrl, ns); 4989 } else { 4990 if 
(!nvme_ns(ctrl, nsid)) { 4991 return NVME_NS_NOT_ATTACHED | NVME_DNR; 4992 } 4993 4994 ctrl->namespaces[nsid] = NULL; 4995 ns->attached--; 4996 4997 nvme_update_dmrsl(ctrl); 4998 } 4999 5000 /* 5001 * Add namespace id to the changed namespace id list for event clearing 5002 * via Get Log Page command. 5003 */ 5004 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { 5005 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, 5006 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, 5007 NVME_LOG_CHANGED_NSLIST); 5008 } 5009 } 5010 5011 return NVME_SUCCESS; 5012 } 5013 5014 static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf, 5015 uint8_t mset, uint8_t pi, uint8_t pil, 5016 NvmeRequest *req) 5017 { 5018 int64_t len, offset; 5019 struct nvme_aio_format_ctx *ctx; 5020 BlockBackend *blk = ns->blkconf.blk; 5021 uint16_t ms; 5022 uintptr_t *num_formats = (uintptr_t *)&req->opaque; 5023 int *count; 5024 5025 if (ns->params.zoned) { 5026 return NVME_INVALID_FORMAT | NVME_DNR; 5027 } 5028 5029 trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil); 5030 5031 if (lbaf > ns->id_ns.nlbaf) { 5032 return NVME_INVALID_FORMAT | NVME_DNR; 5033 } 5034 5035 ms = ns->id_ns.lbaf[lbaf].ms; 5036 5037 if (pi && (ms < sizeof(NvmeDifTuple))) { 5038 return NVME_INVALID_FORMAT | NVME_DNR; 5039 } 5040 5041 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) { 5042 return NVME_INVALID_FIELD | NVME_DNR; 5043 } 5044 5045 nvme_ns_drain(ns); 5046 nvme_ns_shutdown(ns); 5047 nvme_ns_cleanup(ns); 5048 5049 ns->id_ns.dps = (pil << 3) | pi; 5050 ns->id_ns.flbas = lbaf | (mset << 4); 5051 5052 nvme_ns_init_format(ns); 5053 5054 ns->status = NVME_FORMAT_IN_PROGRESS; 5055 5056 len = ns->size; 5057 offset = 0; 5058 5059 count = g_new(int, 1); 5060 *count = 1; 5061 5062 (*num_formats)++; 5063 5064 while (len) { 5065 ctx = g_new(struct nvme_aio_format_ctx, 1); 5066 ctx->req = req; 5067 ctx->ns = ns; 5068 ctx->count = count; 5069 5070 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); 5071 5072 (*count)++; 5073 5074 blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP, 5075 nvme_aio_format_cb, ctx); 5076 5077 offset += bytes; 5078 len -= bytes; 5079 5080 } 5081 5082 if (--(*count)) { 5083 return NVME_NO_COMPLETE; 5084 } 5085 5086 g_free(count); 5087 ns->status = 0x0; 5088 (*num_formats)--; 5089 5090 return NVME_SUCCESS; 5091 } 5092 5093 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req) 5094 { 5095 NvmeNamespace *ns; 5096 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 5097 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 5098 uint8_t lbaf = dw10 & 0xf; 5099 uint8_t mset = (dw10 >> 4) & 0x1; 5100 uint8_t pi = (dw10 >> 5) & 0x7; 5101 uint8_t pil = (dw10 >> 8) & 0x1; 5102 uintptr_t *num_formats = (uintptr_t *)&req->opaque; 5103 uint16_t status; 5104 int i; 5105 5106 trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil); 5107 5108 /* 1-initialize; see the comment in nvme_dsm */ 5109 *num_formats = 1; 5110 5111 if (nsid != NVME_NSID_BROADCAST) { 5112 if (!nvme_nsid_valid(n, nsid)) { 5113 return NVME_INVALID_NSID | NVME_DNR; 5114 } 5115 5116 ns = nvme_ns(n, nsid); 5117 if (!ns) { 5118 return NVME_INVALID_FIELD | NVME_DNR; 5119 } 5120 5121 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req); 5122 if (status && status != NVME_NO_COMPLETE) { 5123 req->status = status; 5124 } 5125 } else { 5126 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5127 ns = nvme_ns(n, i); 5128 if (!ns) { 5129 continue; 5130 } 5131 5132 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req); 5133 if (status && status != NVME_NO_COMPLETE) { 
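                /*
                 * Record only real errors; NVME_NO_COMPLETE just means that
                 * the format of this namespace is still in flight and will be
                 * accounted for by nvme_aio_format_cb.
                 */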
5134 req->status = status; 5135 break; 5136 } 5137 } 5138 } 5139 5140 /* account for the 1-initialization */ 5141 if (--(*num_formats)) { 5142 return NVME_NO_COMPLETE; 5143 } 5144 5145 return req->status; 5146 } 5147 5148 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) 5149 { 5150 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, 5151 nvme_adm_opc_str(req->cmd.opcode)); 5152 5153 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 5154 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); 5155 return NVME_INVALID_OPCODE | NVME_DNR; 5156 } 5157 5158 /* SGLs shall not be used for Admin commands in NVMe over PCIe */ 5159 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) { 5160 return NVME_INVALID_FIELD | NVME_DNR; 5161 } 5162 5163 switch (req->cmd.opcode) { 5164 case NVME_ADM_CMD_DELETE_SQ: 5165 return nvme_del_sq(n, req); 5166 case NVME_ADM_CMD_CREATE_SQ: 5167 return nvme_create_sq(n, req); 5168 case NVME_ADM_CMD_GET_LOG_PAGE: 5169 return nvme_get_log(n, req); 5170 case NVME_ADM_CMD_DELETE_CQ: 5171 return nvme_del_cq(n, req); 5172 case NVME_ADM_CMD_CREATE_CQ: 5173 return nvme_create_cq(n, req); 5174 case NVME_ADM_CMD_IDENTIFY: 5175 return nvme_identify(n, req); 5176 case NVME_ADM_CMD_ABORT: 5177 return nvme_abort(n, req); 5178 case NVME_ADM_CMD_SET_FEATURES: 5179 return nvme_set_feature(n, req); 5180 case NVME_ADM_CMD_GET_FEATURES: 5181 return nvme_get_feature(n, req); 5182 case NVME_ADM_CMD_ASYNC_EV_REQ: 5183 return nvme_aer(n, req); 5184 case NVME_ADM_CMD_NS_ATTACHMENT: 5185 return nvme_ns_attachment(n, req); 5186 case NVME_ADM_CMD_FORMAT_NVM: 5187 return nvme_format(n, req); 5188 default: 5189 assert(false); 5190 } 5191 5192 return NVME_INVALID_OPCODE | NVME_DNR; 5193 } 5194 5195 static void nvme_process_sq(void *opaque) 5196 { 5197 NvmeSQueue *sq = opaque; 5198 NvmeCtrl *n = sq->ctrl; 5199 NvmeCQueue *cq = n->cq[sq->cqid]; 5200 5201 uint16_t status; 5202 hwaddr addr; 5203 NvmeCmd cmd; 5204 NvmeRequest *req; 5205 5206 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) { 5207 addr = sq->dma_addr + sq->head * n->sqe_size; 5208 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) { 5209 trace_pci_nvme_err_addr_read(addr); 5210 trace_pci_nvme_err_cfs(); 5211 n->bar.csts = NVME_CSTS_FAILED; 5212 break; 5213 } 5214 nvme_inc_sq_head(sq); 5215 5216 req = QTAILQ_FIRST(&sq->req_list); 5217 QTAILQ_REMOVE(&sq->req_list, req, entry); 5218 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); 5219 nvme_req_clear(req); 5220 req->cqe.cid = cmd.cid; 5221 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd)); 5222 5223 status = sq->sqid ? 
nvme_io_cmd(n, req) : 5224 nvme_admin_cmd(n, req); 5225 if (status != NVME_NO_COMPLETE) { 5226 req->status = status; 5227 nvme_enqueue_req_completion(cq, req); 5228 } 5229 } 5230 } 5231 5232 static void nvme_ctrl_reset(NvmeCtrl *n) 5233 { 5234 NvmeNamespace *ns; 5235 int i; 5236 5237 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5238 ns = nvme_ns(n, i); 5239 if (!ns) { 5240 continue; 5241 } 5242 5243 nvme_ns_drain(ns); 5244 } 5245 5246 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 5247 if (n->sq[i] != NULL) { 5248 nvme_free_sq(n->sq[i], n); 5249 } 5250 } 5251 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 5252 if (n->cq[i] != NULL) { 5253 nvme_free_cq(n->cq[i], n); 5254 } 5255 } 5256 5257 while (!QTAILQ_EMPTY(&n->aer_queue)) { 5258 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue); 5259 QTAILQ_REMOVE(&n->aer_queue, event, entry); 5260 g_free(event); 5261 } 5262 5263 n->aer_queued = 0; 5264 n->outstanding_aers = 0; 5265 n->qs_created = false; 5266 5267 n->bar.cc = 0; 5268 } 5269 5270 static void nvme_ctrl_shutdown(NvmeCtrl *n) 5271 { 5272 NvmeNamespace *ns; 5273 int i; 5274 5275 if (n->pmr.dev) { 5276 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 5277 } 5278 5279 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5280 ns = nvme_ns(n, i); 5281 if (!ns) { 5282 continue; 5283 } 5284 5285 nvme_ns_shutdown(ns); 5286 } 5287 } 5288 5289 static void nvme_select_iocs(NvmeCtrl *n) 5290 { 5291 NvmeNamespace *ns; 5292 int i; 5293 5294 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5295 ns = nvme_ns(n, i); 5296 if (!ns) { 5297 continue; 5298 } 5299 5300 nvme_select_iocs_ns(n, ns); 5301 } 5302 } 5303 5304 static int nvme_start_ctrl(NvmeCtrl *n) 5305 { 5306 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12; 5307 uint32_t page_size = 1 << page_bits; 5308 5309 if (unlikely(n->cq[0])) { 5310 trace_pci_nvme_err_startfail_cq(); 5311 return -1; 5312 } 5313 if (unlikely(n->sq[0])) { 5314 trace_pci_nvme_err_startfail_sq(); 5315 return -1; 5316 } 5317 if (unlikely(!n->bar.asq)) { 5318 trace_pci_nvme_err_startfail_nbarasq(); 5319 return -1; 5320 } 5321 if (unlikely(!n->bar.acq)) { 5322 trace_pci_nvme_err_startfail_nbaracq(); 5323 return -1; 5324 } 5325 if (unlikely(n->bar.asq & (page_size - 1))) { 5326 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq); 5327 return -1; 5328 } 5329 if (unlikely(n->bar.acq & (page_size - 1))) { 5330 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq); 5331 return -1; 5332 } 5333 if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) { 5334 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc)); 5335 return -1; 5336 } 5337 if (unlikely(NVME_CC_MPS(n->bar.cc) < 5338 NVME_CAP_MPSMIN(n->bar.cap))) { 5339 trace_pci_nvme_err_startfail_page_too_small( 5340 NVME_CC_MPS(n->bar.cc), 5341 NVME_CAP_MPSMIN(n->bar.cap)); 5342 return -1; 5343 } 5344 if (unlikely(NVME_CC_MPS(n->bar.cc) > 5345 NVME_CAP_MPSMAX(n->bar.cap))) { 5346 trace_pci_nvme_err_startfail_page_too_large( 5347 NVME_CC_MPS(n->bar.cc), 5348 NVME_CAP_MPSMAX(n->bar.cap)); 5349 return -1; 5350 } 5351 if (unlikely(NVME_CC_IOCQES(n->bar.cc) < 5352 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) { 5353 trace_pci_nvme_err_startfail_cqent_too_small( 5354 NVME_CC_IOCQES(n->bar.cc), 5355 NVME_CTRL_CQES_MIN(n->bar.cap)); 5356 return -1; 5357 } 5358 if (unlikely(NVME_CC_IOCQES(n->bar.cc) > 5359 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) { 5360 trace_pci_nvme_err_startfail_cqent_too_large( 5361 NVME_CC_IOCQES(n->bar.cc), 5362 NVME_CTRL_CQES_MAX(n->bar.cap)); 5363 return -1; 5364 } 5365 if 
(unlikely(NVME_CC_IOSQES(n->bar.cc) < 5366 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) { 5367 trace_pci_nvme_err_startfail_sqent_too_small( 5368 NVME_CC_IOSQES(n->bar.cc), 5369 NVME_CTRL_SQES_MIN(n->bar.cap)); 5370 return -1; 5371 } 5372 if (unlikely(NVME_CC_IOSQES(n->bar.cc) > 5373 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) { 5374 trace_pci_nvme_err_startfail_sqent_too_large( 5375 NVME_CC_IOSQES(n->bar.cc), 5376 NVME_CTRL_SQES_MAX(n->bar.cap)); 5377 return -1; 5378 } 5379 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) { 5380 trace_pci_nvme_err_startfail_asqent_sz_zero(); 5381 return -1; 5382 } 5383 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) { 5384 trace_pci_nvme_err_startfail_acqent_sz_zero(); 5385 return -1; 5386 } 5387 5388 n->page_bits = page_bits; 5389 n->page_size = page_size; 5390 n->max_prp_ents = n->page_size / sizeof(uint64_t); 5391 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc); 5392 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc); 5393 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0, 5394 NVME_AQA_ACQS(n->bar.aqa) + 1, 1); 5395 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, 5396 NVME_AQA_ASQS(n->bar.aqa) + 1); 5397 5398 nvme_set_timestamp(n, 0ULL); 5399 5400 QTAILQ_INIT(&n->aer_queue); 5401 5402 nvme_select_iocs(n); 5403 5404 return 0; 5405 } 5406 5407 static void nvme_cmb_enable_regs(NvmeCtrl *n) 5408 { 5409 NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1); 5410 NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1); 5411 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); 5412 5413 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); 5414 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); 5415 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); 5416 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); 5417 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); 5418 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ 5419 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb); 5420 } 5421 5422 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, 5423 unsigned size) 5424 { 5425 if (unlikely(offset & (sizeof(uint32_t) - 1))) { 5426 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32, 5427 "MMIO write not 32-bit aligned," 5428 " offset=0x%"PRIx64"", offset); 5429 /* should be ignored, fall through for now */ 5430 } 5431 5432 if (unlikely(size < sizeof(uint32_t))) { 5433 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall, 5434 "MMIO write smaller than 32-bits," 5435 " offset=0x%"PRIx64", size=%u", 5436 offset, size); 5437 /* should be ignored, fall through for now */ 5438 } 5439 5440 switch (offset) { 5441 case 0xc: /* INTMS */ 5442 if (unlikely(msix_enabled(&(n->parent_obj)))) { 5443 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 5444 "undefined access to interrupt mask set" 5445 " when MSI-X is enabled"); 5446 /* should be ignored, fall through for now */ 5447 } 5448 n->bar.intms |= data & 0xffffffff; 5449 n->bar.intmc = n->bar.intms; 5450 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc); 5451 nvme_irq_check(n); 5452 break; 5453 case 0x10: /* INTMC */ 5454 if (unlikely(msix_enabled(&(n->parent_obj)))) { 5455 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 5456 "undefined access to interrupt mask clr" 5457 " when MSI-X is enabled"); 5458 /* should be ignored, fall through for now */ 5459 } 5460 n->bar.intms &= ~(data & 0xffffffff); 5461 n->bar.intmc = n->bar.intms; 5462 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc); 5463 nvme_irq_check(n); 5464 break; 5465 case 0x14: /* CC */ 5466 trace_pci_nvme_mmio_cfg(data & 0xffffffff); 5467 /* Windows first sends data, then sends enable bit */ 5468 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) && 5469 !NVME_CC_SHN(data) 
&& !NVME_CC_SHN(n->bar.cc)) 5470 { 5471 n->bar.cc = data; 5472 } 5473 5474 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) { 5475 n->bar.cc = data; 5476 if (unlikely(nvme_start_ctrl(n))) { 5477 trace_pci_nvme_err_startfail(); 5478 n->bar.csts = NVME_CSTS_FAILED; 5479 } else { 5480 trace_pci_nvme_mmio_start_success(); 5481 n->bar.csts = NVME_CSTS_READY; 5482 } 5483 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) { 5484 trace_pci_nvme_mmio_stopped(); 5485 nvme_ctrl_reset(n); 5486 n->bar.csts &= ~NVME_CSTS_READY; 5487 } 5488 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) { 5489 trace_pci_nvme_mmio_shutdown_set(); 5490 nvme_ctrl_shutdown(n); 5491 n->bar.cc = data; 5492 n->bar.csts |= NVME_CSTS_SHST_COMPLETE; 5493 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) { 5494 trace_pci_nvme_mmio_shutdown_cleared(); 5495 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE; 5496 n->bar.cc = data; 5497 } 5498 break; 5499 case 0x1c: /* CSTS */ 5500 if (data & (1 << 4)) { 5501 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported, 5502 "attempted to W1C CSTS.NSSRO" 5503 " but CAP.NSSRS is zero (not supported)"); 5504 } else if (data != 0) { 5505 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts, 5506 "attempted to set a read only bit" 5507 " of controller status"); 5508 } 5509 break; 5510 case 0x20: /* NSSR */ 5511 if (data == 0x4e564d65) { 5512 trace_pci_nvme_ub_mmiowr_ssreset_unsupported(); 5513 } else { 5514 /* The spec says that writes of other values have no effect */ 5515 return; 5516 } 5517 break; 5518 case 0x24: /* AQA */ 5519 n->bar.aqa = data & 0xffffffff; 5520 trace_pci_nvme_mmio_aqattr(data & 0xffffffff); 5521 break; 5522 case 0x28: /* ASQ */ 5523 n->bar.asq = size == 8 ? data : 5524 (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff); 5525 trace_pci_nvme_mmio_asqaddr(data); 5526 break; 5527 case 0x2c: /* ASQ hi */ 5528 n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32); 5529 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq); 5530 break; 5531 case 0x30: /* ACQ */ 5532 trace_pci_nvme_mmio_acqaddr(data); 5533 n->bar.acq = size == 8 ? data : 5534 (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff); 5535 break; 5536 case 0x34: /* ACQ hi */ 5537 n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32); 5538 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq); 5539 break; 5540 case 0x38: /* CMBLOC */ 5541 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved, 5542 "invalid write to reserved CMBLOC" 5543 " when CMBSZ is zero, ignored"); 5544 return; 5545 case 0x3C: /* CMBSZ */ 5546 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly, 5547 "invalid write to read only CMBSZ, ignored"); 5548 return; 5549 case 0x50: /* CMBMSC */ 5550 if (!NVME_CAP_CMBS(n->bar.cap)) { 5551 return; 5552 } 5553 5554 n->bar.cmbmsc = size == 8 ? 
data : 5555 (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff); 5556 n->cmb.cmse = false; 5557 5558 if (NVME_CMBMSC_CRE(data)) { 5559 nvme_cmb_enable_regs(n); 5560 5561 if (NVME_CMBMSC_CMSE(data)) { 5562 hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT; 5563 if (cba + int128_get64(n->cmb.mem.size) < cba) { 5564 NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1); 5565 return; 5566 } 5567 5568 n->cmb.cba = cba; 5569 n->cmb.cmse = true; 5570 } 5571 } else { 5572 n->bar.cmbsz = 0; 5573 n->bar.cmbloc = 0; 5574 } 5575 5576 return; 5577 case 0x54: /* CMBMSC hi */ 5578 n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32); 5579 return; 5580 5581 case 0xe00: /* PMRCAP */ 5582 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, 5583 "invalid write to PMRCAP register, ignored"); 5584 return; 5585 case 0xe04: /* PMRCTL */ 5586 n->bar.pmrctl = data; 5587 if (NVME_PMRCTL_EN(data)) { 5588 memory_region_set_enabled(&n->pmr.dev->mr, true); 5589 n->bar.pmrsts = 0; 5590 } else { 5591 memory_region_set_enabled(&n->pmr.dev->mr, false); 5592 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); 5593 n->pmr.cmse = false; 5594 } 5595 return; 5596 case 0xe08: /* PMRSTS */ 5597 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, 5598 "invalid write to PMRSTS register, ignored"); 5599 return; 5600 case 0xe0C: /* PMREBS */ 5601 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly, 5602 "invalid write to PMREBS register, ignored"); 5603 return; 5604 case 0xe10: /* PMRSWTP */ 5605 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly, 5606 "invalid write to PMRSWTP register, ignored"); 5607 return; 5608 case 0xe14: /* PMRMSCL */ 5609 if (!NVME_CAP_PMRS(n->bar.cap)) { 5610 return; 5611 } 5612 5613 n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff); 5614 n->pmr.cmse = false; 5615 5616 if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) { 5617 hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT; 5618 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) { 5619 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1); 5620 return; 5621 } 5622 5623 n->pmr.cmse = true; 5624 n->pmr.cba = cba; 5625 } 5626 5627 return; 5628 case 0xe18: /* PMRMSCU */ 5629 if (!NVME_CAP_PMRS(n->bar.cap)) { 5630 return; 5631 } 5632 5633 n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32); 5634 return; 5635 default: 5636 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, 5637 "invalid MMIO write," 5638 " offset=0x%"PRIx64", data=%"PRIx64"", 5639 offset, data); 5640 break; 5641 } 5642 } 5643 5644 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) 5645 { 5646 NvmeCtrl *n = (NvmeCtrl *)opaque; 5647 uint8_t *ptr = (uint8_t *)&n->bar; 5648 uint64_t val = 0; 5649 5650 trace_pci_nvme_mmio_read(addr, size); 5651 5652 if (unlikely(addr & (sizeof(uint32_t) - 1))) { 5653 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, 5654 "MMIO read not 32-bit aligned," 5655 " offset=0x%"PRIx64"", addr); 5656 /* should RAZ, fall through for now */ 5657 } else if (unlikely(size < sizeof(uint32_t))) { 5658 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall, 5659 "MMIO read smaller than 32-bits," 5660 " offset=0x%"PRIx64"", addr); 5661 /* should RAZ, fall through for now */ 5662 } 5663 5664 if (addr < sizeof(n->bar)) { 5665 /* 5666 * When PMRWBM bit 1 is set then read from 5667 * from PMRSTS should ensure prior writes 5668 * made it to persistent media 5669 */ 5670 if (addr == 0xe08 && 5671 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) { 5672 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 5673 } 5674 memcpy(&val, ptr + addr, size); 5675 } else { 5676 
NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs, 5677 "MMIO read beyond last register," 5678 " offset=0x%"PRIx64", returning 0", addr); 5679 } 5680 5681 return val; 5682 } 5683 5684 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) 5685 { 5686 uint32_t qid; 5687 5688 if (unlikely(addr & ((1 << 2) - 1))) { 5689 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, 5690 "doorbell write not 32-bit aligned," 5691 " offset=0x%"PRIx64", ignoring", addr); 5692 return; 5693 } 5694 5695 if (((addr - 0x1000) >> 2) & 1) { 5696 /* Completion queue doorbell write */ 5697 5698 uint16_t new_head = val & 0xffff; 5699 int start_sqs; 5700 NvmeCQueue *cq; 5701 5702 qid = (addr - (0x1000 + (1 << 2))) >> 3; 5703 if (unlikely(nvme_check_cqid(n, qid))) { 5704 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq, 5705 "completion queue doorbell write" 5706 " for nonexistent queue," 5707 " sqid=%"PRIu32", ignoring", qid); 5708 5709 /* 5710 * NVM Express v1.3d, Section 4.1 state: "If host software writes 5711 * an invalid value to the Submission Queue Tail Doorbell or 5712 * Completion Queue Head Doorbell regiter and an Asynchronous Event 5713 * Request command is outstanding, then an asynchronous event is 5714 * posted to the Admin Completion Queue with a status code of 5715 * Invalid Doorbell Write Value." 5716 * 5717 * Also note that the spec includes the "Invalid Doorbell Register" 5718 * status code, but nowhere does it specify when to use it. 5719 * However, it seems reasonable to use it here in a similar 5720 * fashion. 5721 */ 5722 if (n->outstanding_aers) { 5723 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5724 NVME_AER_INFO_ERR_INVALID_DB_REGISTER, 5725 NVME_LOG_ERROR_INFO); 5726 } 5727 5728 return; 5729 } 5730 5731 cq = n->cq[qid]; 5732 if (unlikely(new_head >= cq->size)) { 5733 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead, 5734 "completion queue doorbell write value" 5735 " beyond queue size, sqid=%"PRIu32"," 5736 " new_head=%"PRIu16", ignoring", 5737 qid, new_head); 5738 5739 if (n->outstanding_aers) { 5740 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5741 NVME_AER_INFO_ERR_INVALID_DB_VALUE, 5742 NVME_LOG_ERROR_INFO); 5743 } 5744 5745 return; 5746 } 5747 5748 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); 5749 5750 start_sqs = nvme_cq_full(cq) ? 
1 : 0; 5751 cq->head = new_head; 5752 if (start_sqs) { 5753 NvmeSQueue *sq; 5754 QTAILQ_FOREACH(sq, &cq->sq_list, entry) { 5755 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 5756 } 5757 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 5758 } 5759 5760 if (cq->tail == cq->head) { 5761 nvme_irq_deassert(n, cq); 5762 } 5763 } else { 5764 /* Submission queue doorbell write */ 5765 5766 uint16_t new_tail = val & 0xffff; 5767 NvmeSQueue *sq; 5768 5769 qid = (addr - 0x1000) >> 3; 5770 if (unlikely(nvme_check_sqid(n, qid))) { 5771 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq, 5772 "submission queue doorbell write" 5773 " for nonexistent queue," 5774 " sqid=%"PRIu32", ignoring", qid); 5775 5776 if (n->outstanding_aers) { 5777 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5778 NVME_AER_INFO_ERR_INVALID_DB_REGISTER, 5779 NVME_LOG_ERROR_INFO); 5780 } 5781 5782 return; 5783 } 5784 5785 sq = n->sq[qid]; 5786 if (unlikely(new_tail >= sq->size)) { 5787 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail, 5788 "submission queue doorbell write value" 5789 " beyond queue size, sqid=%"PRIu32"," 5790 " new_tail=%"PRIu16", ignoring", 5791 qid, new_tail); 5792 5793 if (n->outstanding_aers) { 5794 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 5795 NVME_AER_INFO_ERR_INVALID_DB_VALUE, 5796 NVME_LOG_ERROR_INFO); 5797 } 5798 5799 return; 5800 } 5801 5802 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail); 5803 5804 sq->tail = new_tail; 5805 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 5806 } 5807 } 5808 5809 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, 5810 unsigned size) 5811 { 5812 NvmeCtrl *n = (NvmeCtrl *)opaque; 5813 5814 trace_pci_nvme_mmio_write(addr, data, size); 5815 5816 if (addr < sizeof(n->bar)) { 5817 nvme_write_bar(n, addr, data, size); 5818 } else { 5819 nvme_process_db(n, addr, data); 5820 } 5821 } 5822 5823 static const MemoryRegionOps nvme_mmio_ops = { 5824 .read = nvme_mmio_read, 5825 .write = nvme_mmio_write, 5826 .endianness = DEVICE_LITTLE_ENDIAN, 5827 .impl = { 5828 .min_access_size = 2, 5829 .max_access_size = 8, 5830 }, 5831 }; 5832 5833 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, 5834 unsigned size) 5835 { 5836 NvmeCtrl *n = (NvmeCtrl *)opaque; 5837 stn_le_p(&n->cmb.buf[addr], size, data); 5838 } 5839 5840 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) 5841 { 5842 NvmeCtrl *n = (NvmeCtrl *)opaque; 5843 return ldn_le_p(&n->cmb.buf[addr], size); 5844 } 5845 5846 static const MemoryRegionOps nvme_cmb_ops = { 5847 .read = nvme_cmb_read, 5848 .write = nvme_cmb_write, 5849 .endianness = DEVICE_LITTLE_ENDIAN, 5850 .impl = { 5851 .min_access_size = 1, 5852 .max_access_size = 8, 5853 }, 5854 }; 5855 5856 static void nvme_check_constraints(NvmeCtrl *n, Error **errp) 5857 { 5858 NvmeParams *params = &n->params; 5859 5860 if (params->num_queues) { 5861 warn_report("num_queues is deprecated; please use max_ioqpairs " 5862 "instead"); 5863 5864 params->max_ioqpairs = params->num_queues - 1; 5865 } 5866 5867 if (n->namespace.blkconf.blk && n->subsys) { 5868 error_setg(errp, "subsystem support is unavailable with legacy " 5869 "namespace ('drive' property)"); 5870 return; 5871 } 5872 5873 if (params->max_ioqpairs < 1 || 5874 params->max_ioqpairs > NVME_MAX_IOQPAIRS) { 5875 error_setg(errp, "max_ioqpairs must be between 1 and %d", 5876 NVME_MAX_IOQPAIRS); 5877 return; 5878 } 5879 5880 if (params->msix_qsize < 1 || 5881 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) { 5882 
error_setg(errp, "msix_qsize must be between 1 and %d", 5883 PCI_MSIX_FLAGS_QSIZE + 1); 5884 return; 5885 } 5886 5887 if (!params->serial) { 5888 error_setg(errp, "serial property not set"); 5889 return; 5890 } 5891 5892 if (n->pmr.dev) { 5893 if (host_memory_backend_is_mapped(n->pmr.dev)) { 5894 error_setg(errp, "can't use already busy memdev: %s", 5895 object_get_canonical_path_component(OBJECT(n->pmr.dev))); 5896 return; 5897 } 5898 5899 if (!is_power_of_2(n->pmr.dev->size)) { 5900 error_setg(errp, "pmr backend size needs to be power of 2 in size"); 5901 return; 5902 } 5903 5904 host_memory_backend_set_mapped(n->pmr.dev, true); 5905 } 5906 5907 if (n->params.zasl > n->params.mdts) { 5908 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less " 5909 "than or equal to mdts (Maximum Data Transfer Size)"); 5910 return; 5911 } 5912 5913 if (!n->params.vsl) { 5914 error_setg(errp, "vsl must be non-zero"); 5915 return; 5916 } 5917 } 5918 5919 static void nvme_init_state(NvmeCtrl *n) 5920 { 5921 /* add one to max_ioqpairs to account for the admin queue pair */ 5922 n->reg_size = pow2ceil(sizeof(NvmeBar) + 5923 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE); 5924 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); 5925 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1); 5926 n->temperature = NVME_TEMPERATURE; 5927 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING; 5928 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 5929 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); 5930 } 5931 5932 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) 5933 { 5934 uint64_t cmb_size = n->params.cmb_size_mb * MiB; 5935 5936 n->cmb.buf = g_malloc0(cmb_size); 5937 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n, 5938 "nvme-cmb", cmb_size); 5939 pci_register_bar(pci_dev, NVME_CMB_BIR, 5940 PCI_BASE_ADDRESS_SPACE_MEMORY | 5941 PCI_BASE_ADDRESS_MEM_TYPE_64 | 5942 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem); 5943 5944 NVME_CAP_SET_CMBS(n->bar.cap, 1); 5945 5946 if (n->params.legacy_cmb) { 5947 nvme_cmb_enable_regs(n); 5948 n->cmb.cmse = true; 5949 } 5950 } 5951 5952 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) 5953 { 5954 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1); 5955 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1); 5956 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR); 5957 /* Turn on bit 1 support */ 5958 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02); 5959 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1); 5960 5961 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap), 5962 PCI_BASE_ADDRESS_SPACE_MEMORY | 5963 PCI_BASE_ADDRESS_MEM_TYPE_64 | 5964 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr); 5965 5966 memory_region_set_enabled(&n->pmr.dev->mr, false); 5967 } 5968 5969 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) 5970 { 5971 uint8_t *pci_conf = pci_dev->config; 5972 uint64_t bar_size, msix_table_size, msix_pba_size; 5973 unsigned msix_table_offset, msix_pba_offset; 5974 int ret; 5975 5976 Error *err = NULL; 5977 5978 pci_conf[PCI_INTERRUPT_PIN] = 1; 5979 pci_config_set_prog_interface(pci_conf, 0x2); 5980 5981 if (n->params.use_intel_id) { 5982 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL); 5983 pci_config_set_device_id(pci_conf, 0x5845); 5984 } else { 5985 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT); 5986 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME); 5987 } 5988 5989 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); 5990 pcie_endpoint_cap_init(pci_dev, 0x80); 5991 5992 bar_size = 
QEMU_ALIGN_UP(n->reg_size, 4 * KiB); 5993 msix_table_offset = bar_size; 5994 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize; 5995 5996 bar_size += msix_table_size; 5997 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); 5998 msix_pba_offset = bar_size; 5999 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8; 6000 6001 bar_size += msix_pba_size; 6002 bar_size = pow2ceil(bar_size); 6003 6004 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size); 6005 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme", 6006 n->reg_size); 6007 memory_region_add_subregion(&n->bar0, 0, &n->iomem); 6008 6009 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | 6010 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); 6011 ret = msix_init(pci_dev, n->params.msix_qsize, 6012 &n->bar0, 0, msix_table_offset, 6013 &n->bar0, 0, msix_pba_offset, 0, &err); 6014 if (ret < 0) { 6015 if (ret == -ENOTSUP) { 6016 warn_report_err(err); 6017 } else { 6018 error_propagate(errp, err); 6019 return ret; 6020 } 6021 } 6022 6023 if (n->params.cmb_size_mb) { 6024 nvme_init_cmb(n, pci_dev); 6025 } 6026 6027 if (n->pmr.dev) { 6028 nvme_init_pmr(n, pci_dev); 6029 } 6030 6031 return 0; 6032 } 6033 6034 static void nvme_init_subnqn(NvmeCtrl *n) 6035 { 6036 NvmeSubsystem *subsys = n->subsys; 6037 NvmeIdCtrl *id = &n->id_ctrl; 6038 6039 if (!subsys) { 6040 snprintf((char *)id->subnqn, sizeof(id->subnqn), 6041 "nqn.2019-08.org.qemu:%s", n->params.serial); 6042 } else { 6043 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn); 6044 } 6045 } 6046 6047 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) 6048 { 6049 NvmeIdCtrl *id = &n->id_ctrl; 6050 uint8_t *pci_conf = pci_dev->config; 6051 6052 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); 6053 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); 6054 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' '); 6055 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' '); 6056 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' '); 6057 6058 id->cntlid = cpu_to_le16(n->cntlid); 6059 6060 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR); 6061 6062 id->rab = 6; 6063 6064 if (n->params.use_intel_id) { 6065 id->ieee[0] = 0xb3; 6066 id->ieee[1] = 0x02; 6067 id->ieee[2] = 0x00; 6068 } else { 6069 id->ieee[0] = 0x00; 6070 id->ieee[1] = 0x54; 6071 id->ieee[2] = 0x52; 6072 } 6073 6074 id->mdts = n->params.mdts; 6075 id->ver = cpu_to_le32(NVME_SPEC_VER); 6076 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT); 6077 id->cntrltype = 0x1; 6078 6079 /* 6080 * Because the controller always completes the Abort command immediately, 6081 * there can never be more than one concurrently executing Abort command, 6082 * so this value is never used for anything. Note that there can easily be 6083 * many Abort commands in the queues, but they are not considered 6084 * "executing" until processed by nvme_abort. 6085 * 6086 * The specification recommends a value of 3 for Abort Command Limit (four 6087 * concurrently outstanding Abort commands), so lets use that though it is 6088 * inconsequential. 
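     * (nvme_abort above just sets completion dword 0 to 1, i.e. "command not
     * aborted", for every request it accepts, so a larger limit would change
     * nothing.)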
6089 */ 6090 id->acl = 3; 6091 id->aerl = n->params.aerl; 6092 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; 6093 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED; 6094 6095 /* recommended default value (~70 C) */ 6096 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); 6097 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL); 6098 6099 id->sqes = (0x6 << 4) | 0x6; 6100 id->cqes = (0x4 << 4) | 0x4; 6101 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES); 6102 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | 6103 NVME_ONCS_FEATURES | NVME_ONCS_DSM | 6104 NVME_ONCS_COMPARE | NVME_ONCS_COPY); 6105 6106 /* 6107 * NOTE: If this device ever supports a command set that does NOT use 0x0 6108 * as a Flush-equivalent operation, support for the broadcast NSID in Flush 6109 * should probably be removed. 6110 * 6111 * See comment in nvme_io_cmd. 6112 */ 6113 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT; 6114 6115 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0); 6116 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | 6117 NVME_CTRL_SGLS_BITBUCKET); 6118 6119 nvme_init_subnqn(n); 6120 6121 id->psd[0].mp = cpu_to_le16(0x9c4); 6122 id->psd[0].enlat = cpu_to_le32(0x10); 6123 id->psd[0].exlat = cpu_to_le32(0x4); 6124 6125 if (n->subsys) { 6126 id->cmic |= NVME_CMIC_MULTI_CTRL; 6127 } 6128 6129 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); 6130 NVME_CAP_SET_CQR(n->bar.cap, 1); 6131 NVME_CAP_SET_TO(n->bar.cap, 0xf); 6132 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); 6133 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP); 6134 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); 6135 NVME_CAP_SET_MPSMAX(n->bar.cap, 4); 6136 NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); 6137 NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 
1 : 0); 6138 6139 n->bar.vs = NVME_SPEC_VER; 6140 n->bar.intmc = n->bar.intms = 0; 6141 } 6142 6143 static int nvme_init_subsys(NvmeCtrl *n, Error **errp) 6144 { 6145 int cntlid; 6146 6147 if (!n->subsys) { 6148 return 0; 6149 } 6150 6151 cntlid = nvme_subsys_register_ctrl(n, errp); 6152 if (cntlid < 0) { 6153 return -1; 6154 } 6155 6156 n->cntlid = cntlid; 6157 6158 return 0; 6159 } 6160 6161 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns) 6162 { 6163 uint32_t nsid = ns->params.nsid; 6164 assert(nsid && nsid <= NVME_MAX_NAMESPACES); 6165 6166 n->namespaces[nsid] = ns; 6167 ns->attached++; 6168 6169 n->dmrsl = MIN_NON_ZERO(n->dmrsl, 6170 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 6171 } 6172 6173 static void nvme_realize(PCIDevice *pci_dev, Error **errp) 6174 { 6175 NvmeCtrl *n = NVME(pci_dev); 6176 NvmeNamespace *ns; 6177 Error *local_err = NULL; 6178 6179 nvme_check_constraints(n, &local_err); 6180 if (local_err) { 6181 error_propagate(errp, local_err); 6182 return; 6183 } 6184 6185 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, 6186 &pci_dev->qdev, n->parent_obj.qdev.id); 6187 6188 nvme_init_state(n); 6189 if (nvme_init_pci(n, pci_dev, errp)) { 6190 return; 6191 } 6192 6193 if (nvme_init_subsys(n, errp)) { 6194 error_propagate(errp, local_err); 6195 return; 6196 } 6197 nvme_init_ctrl(n, pci_dev); 6198 6199 /* setup a namespace if the controller drive property was given */ 6200 if (n->namespace.blkconf.blk) { 6201 ns = &n->namespace; 6202 ns->params.nsid = 1; 6203 6204 if (nvme_ns_setup(n, ns, errp)) { 6205 return; 6206 } 6207 6208 nvme_attach_ns(n, ns); 6209 } 6210 } 6211 6212 static void nvme_exit(PCIDevice *pci_dev) 6213 { 6214 NvmeCtrl *n = NVME(pci_dev); 6215 NvmeNamespace *ns; 6216 int i; 6217 6218 nvme_ctrl_reset(n); 6219 6220 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 6221 ns = nvme_ns(n, i); 6222 if (!ns) { 6223 continue; 6224 } 6225 6226 nvme_ns_cleanup(ns); 6227 } 6228 6229 g_free(n->cq); 6230 g_free(n->sq); 6231 g_free(n->aer_reqs); 6232 6233 if (n->params.cmb_size_mb) { 6234 g_free(n->cmb.buf); 6235 } 6236 6237 if (n->pmr.dev) { 6238 host_memory_backend_set_mapped(n->pmr.dev, false); 6239 } 6240 msix_uninit(pci_dev, &n->bar0, &n->bar0); 6241 memory_region_del_subregion(&n->bar0, &n->iomem); 6242 } 6243 6244 static Property nvme_props[] = { 6245 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf), 6246 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND, 6247 HostMemoryBackend *), 6248 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS, 6249 NvmeSubsystem *), 6250 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), 6251 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0), 6252 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0), 6253 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64), 6254 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65), 6255 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3), 6256 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), 6257 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), 6258 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7), 6259 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), 6260 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), 6261 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), 6262 DEFINE_PROP_END_OF_LIST(), 6263 }; 6264 6265 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name, 6266 void *opaque, Error **errp) 6267 { 6268 
NvmeCtrl *n = NVME(obj); 6269 uint8_t value = n->smart_critical_warning; 6270 6271 visit_type_uint8(v, name, &value, errp); 6272 } 6273 6274 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, 6275 void *opaque, Error **errp) 6276 { 6277 NvmeCtrl *n = NVME(obj); 6278 uint8_t value, old_value, cap = 0, index, event; 6279 6280 if (!visit_type_uint8(v, name, &value, errp)) { 6281 return; 6282 } 6283 6284 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY 6285 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; 6286 if (NVME_CAP_PMRS(n->bar.cap)) { 6287 cap |= NVME_SMART_PMR_UNRELIABLE; 6288 } 6289 6290 if ((value & cap) != value) { 6291 error_setg(errp, "unsupported smart critical warning bits: 0x%x", 6292 value & ~cap); 6293 return; 6294 } 6295 6296 old_value = n->smart_critical_warning; 6297 n->smart_critical_warning = value; 6298 6299 /* only inject new bits of smart critical warning */ 6300 for (index = 0; index < NVME_SMART_WARN_MAX; index++) { 6301 event = 1 << index; 6302 if (value & ~old_value & event) 6303 nvme_smart_event(n, event); 6304 } 6305 } 6306 6307 static const VMStateDescription nvme_vmstate = { 6308 .name = "nvme", 6309 .unmigratable = 1, 6310 }; 6311 6312 static void nvme_class_init(ObjectClass *oc, void *data) 6313 { 6314 DeviceClass *dc = DEVICE_CLASS(oc); 6315 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc); 6316 6317 pc->realize = nvme_realize; 6318 pc->exit = nvme_exit; 6319 pc->class_id = PCI_CLASS_STORAGE_EXPRESS; 6320 pc->revision = 2; 6321 6322 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); 6323 dc->desc = "Non-Volatile Memory Express"; 6324 device_class_set_props(dc, nvme_props); 6325 dc->vmsd = &nvme_vmstate; 6326 } 6327 6328 static void nvme_instance_init(Object *obj) 6329 { 6330 NvmeCtrl *n = NVME(obj); 6331 6332 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex, 6333 "bootindex", "/namespace@1,0", 6334 DEVICE(obj)); 6335 6336 object_property_add(obj, "smart_critical_warning", "uint8", 6337 nvme_get_smart_warning, 6338 nvme_set_smart_warning, NULL, NULL); 6339 } 6340 6341 static const TypeInfo nvme_info = { 6342 .name = TYPE_NVME, 6343 .parent = TYPE_PCI_DEVICE, 6344 .instance_size = sizeof(NvmeCtrl), 6345 .instance_init = nvme_instance_init, 6346 .class_init = nvme_class_init, 6347 .interfaces = (InterfaceInfo[]) { 6348 { INTERFACE_PCIE_DEVICE }, 6349 { } 6350 }, 6351 }; 6352 6353 static const TypeInfo nvme_bus_info = { 6354 .name = TYPE_NVME_BUS, 6355 .parent = TYPE_BUS, 6356 .instance_size = sizeof(NvmeBus), 6357 }; 6358 6359 static void nvme_register_types(void) 6360 { 6361 type_register_static(&nvme_info); 6362 type_register_static(&nvme_bus_info); 6363 } 6364 6365 type_init(nvme_register_types) 6366
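/*
 * Illustrative sketch only; nothing below is referenced by the device model.
 * It restates two pieces of register arithmetic from this file under the
 * assumptions that apply here: nvme_process_db decodes doorbell writes with a
 * 4-byte stride (CAP.DSTRD = 0), and nvme_start_ctrl derives the host page
 * size from CC.MPS as 2^(12 + MPS). The helper names are made up for the
 * example.
 */
static inline hwaddr nvme_example_doorbell_offset(uint16_t qid, bool is_cq)
{
    /* SQ y tail doorbell at 1000h + (2y) * 4, CQ y head at 1000h + (2y + 1) * 4 */
    return 0x1000 + ((2 * qid + (is_cq ? 1 : 0)) << 2);
}

static inline uint32_t nvme_example_page_size(uint32_t cc_mps)
{
    /* CC.MPS = 0 selects the 4 KiB minimum page size (CAP.MPSMIN) */
    return 1 << (12 + cc_mps);
}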