/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 *   https://nvmexpress.org/developers/nvme-specification/
 *
 *
 * Notes on coding style
 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in the comments
 * (i.e. 'h' suffix instead of '0x' prefix).
 *
 * Usage
 * -----
 * See docs/system/nvme.rst for extensive documentation.
 *
 * Add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
 *      -device nvme,serial=<serial>,id=<bus_name>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>, \
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
 *              mdts=<N[optional]>,vsl=<N[optional]>, \
 *              zoned.zasl=<N[optional]>, \
 *              zoned.auto_transition=<on|off[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
 *              zoned=<true|false[optional]>, \
 *              subsys=<subsys_id>,detached=<true|false[optional]>
 *
 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
 * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
 * default, the device will use the "v1.4 CMB scheme" - use the `legacy-cmb`
 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
 *
 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
 * For example:
 *      -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 *       size=<size> .... -device nvme,...,pmrdev=<mem_id>
 *
 * The PMR will use BAR 4/5 exclusively.
 *
 * To place controller(s) and namespace(s) in a subsystem, provide an
 * nvme-subsys device as above.
 *
 * nvme subsystem device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `nqn`
 *   This parameter provides the `<nqn_id>` part of the string
 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
 *   subsystem, but this is not enforced by QEMU. If not specified, it will
 *   default to the value of the `id` parameter (`<subsys_id>`).
 *
 * nvme device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`
 *   Specifying this parameter attaches the controller to the subsystem and
 *   the SUBNQN field in the controller will report the NQN of the subsystem
 *   device. This also enables the multi-controller capability represented in
 *   the Identify Controller data structure in CMIC (Controller Multi-path I/O
 *   and Namespace Sharing Capabilities).
 *
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
 *
 * - `mdts`
 *   Indicates the maximum data transfer size for a command that transfers data
 *   between host-accessible memory and the controller. The value is specified
 *   as a power of two (2^n) and is in units of the minimum memory page size
 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
 *   KiB).
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * - `zoned.auto_transition`
 *   Indicates if zones in zone state implicitly opened can be automatically
 *   transitioned to zone state closed for resource management purposes.
 *   Defaults to 'on'.
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `shared`
 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 *   or implicitly by the most recently defined NvmeBus) is linked to an
 *   nvme-subsys device, the namespace will be attached to all controllers in
 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 *   private namespace and may only be attached to a single controller at a
 *   time.
 *
 * - `detached`
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
 *   namespace will be available in the subsystem but not attached to any
 *   controllers.
 *
 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
 * In this case, the following namespace properties are available to configure
 * zoned operation:
 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 *
 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 *         The value 0 (default) forces zone capacity to be the same as zone
 *         size. The value of this property may not exceed zone size.
 *
 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
 *         This value needs to be specified in 64B units. If it is zero,
 *         namespace(s) will not support zone descriptor extensions.
 *
 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently active zones.
 *
 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently open zones.
 *
 *     zoned.cross_read=<enable RAZB, default: false>
 *         Setting this property to true enables Read Across Zone Boundaries.
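 *
 * As an illustrative example only (the sizes and limits below are arbitrary
 * placeholders, not recommendations), a zoned namespace combining several of
 * these properties could be configured as:
 *     -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>, \
 *             zoned=true,zoned.zone_size=64M,zoned.zone_capacity=48M, \
 *             zoned.max_open=16,zoned.max_active=32,zoned.cross_read=true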
150 */ 151 152 #include "qemu/osdep.h" 153 #include "qemu/cutils.h" 154 #include "qemu/error-report.h" 155 #include "qemu/log.h" 156 #include "qemu/units.h" 157 #include "qapi/error.h" 158 #include "qapi/visitor.h" 159 #include "sysemu/sysemu.h" 160 #include "sysemu/block-backend.h" 161 #include "sysemu/hostmem.h" 162 #include "hw/pci/msix.h" 163 #include "migration/vmstate.h" 164 165 #include "nvme.h" 166 #include "trace.h" 167 168 #define NVME_MAX_IOQPAIRS 0xffff 169 #define NVME_DB_SIZE 4 170 #define NVME_SPEC_VER 0x00010400 171 #define NVME_CMB_BIR 2 172 #define NVME_PMR_BIR 4 173 #define NVME_TEMPERATURE 0x143 174 #define NVME_TEMPERATURE_WARNING 0x157 175 #define NVME_TEMPERATURE_CRITICAL 0x175 176 #define NVME_NUM_FW_SLOTS 1 177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) 178 179 #define NVME_GUEST_ERR(trace, fmt, ...) \ 180 do { \ 181 (trace_##trace)(__VA_ARGS__); \ 182 qemu_log_mask(LOG_GUEST_ERROR, #trace \ 183 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \ 184 } while (0) 185 186 static const bool nvme_feature_support[NVME_FID_MAX] = { 187 [NVME_ARBITRATION] = true, 188 [NVME_POWER_MANAGEMENT] = true, 189 [NVME_TEMPERATURE_THRESHOLD] = true, 190 [NVME_ERROR_RECOVERY] = true, 191 [NVME_VOLATILE_WRITE_CACHE] = true, 192 [NVME_NUMBER_OF_QUEUES] = true, 193 [NVME_INTERRUPT_COALESCING] = true, 194 [NVME_INTERRUPT_VECTOR_CONF] = true, 195 [NVME_WRITE_ATOMICITY] = true, 196 [NVME_ASYNCHRONOUS_EVENT_CONF] = true, 197 [NVME_TIMESTAMP] = true, 198 [NVME_COMMAND_SET_PROFILE] = true, 199 }; 200 201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { 202 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE, 203 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS, 204 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE, 205 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE, 206 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE, 207 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE, 208 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE, 209 }; 210 211 static const uint32_t nvme_cse_acs[256] = { 212 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP, 213 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP, 214 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, 215 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP, 216 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP, 217 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP, 218 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP, 219 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, 220 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, 221 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, 222 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, 223 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 224 }; 225 226 static const uint32_t nvme_cse_iocs_none[256]; 227 228 static const uint32_t nvme_cse_iocs_nvm[256] = { 229 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 230 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 231 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 232 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, 233 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 234 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, 235 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 236 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, 237 }; 238 239 static const uint32_t nvme_cse_iocs_zoned[256] = { 240 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 241 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 242 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 243 [NVME_CMD_READ] = 
NVME_CMD_EFF_CSUPP, 244 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 245 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, 246 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 247 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, 248 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 249 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, 250 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, 251 }; 252 253 static void nvme_process_sq(void *opaque); 254 255 static uint16_t nvme_sqid(NvmeRequest *req) 256 { 257 return le16_to_cpu(req->sq->sqid); 258 } 259 260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, 261 NvmeZoneState state) 262 { 263 if (QTAILQ_IN_USE(zone, entry)) { 264 switch (nvme_get_zone_state(zone)) { 265 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 266 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); 267 break; 268 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 269 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 270 break; 271 case NVME_ZONE_STATE_CLOSED: 272 QTAILQ_REMOVE(&ns->closed_zones, zone, entry); 273 break; 274 case NVME_ZONE_STATE_FULL: 275 QTAILQ_REMOVE(&ns->full_zones, zone, entry); 276 default: 277 ; 278 } 279 } 280 281 nvme_set_zone_state(zone, state); 282 283 switch (state) { 284 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 285 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry); 286 break; 287 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 288 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry); 289 break; 290 case NVME_ZONE_STATE_CLOSED: 291 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry); 292 break; 293 case NVME_ZONE_STATE_FULL: 294 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry); 295 case NVME_ZONE_STATE_READ_ONLY: 296 break; 297 default: 298 zone->d.za = 0; 299 } 300 } 301 302 /* 303 * Check if we can open a zone without exceeding open/active limits. 304 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). 305 */ 306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) 307 { 308 if (ns->params.max_active_zones != 0 && 309 ns->nr_active_zones + act > ns->params.max_active_zones) { 310 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones); 311 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; 312 } 313 if (ns->params.max_open_zones != 0 && 314 ns->nr_open_zones + opn > ns->params.max_open_zones) { 315 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones); 316 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; 317 } 318 319 return NVME_SUCCESS; 320 } 321 322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) 323 { 324 hwaddr hi, lo; 325 326 if (!n->cmb.cmse) { 327 return false; 328 } 329 330 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba; 331 hi = lo + int128_get64(n->cmb.mem.size); 332 333 return addr >= lo && addr < hi; 334 } 335 336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) 337 { 338 hwaddr base = n->params.legacy_cmb ? 
n->cmb.mem.addr : n->cmb.cba; 339 return &n->cmb.buf[addr - base]; 340 } 341 342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) 343 { 344 hwaddr hi; 345 346 if (!n->pmr.cmse) { 347 return false; 348 } 349 350 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size); 351 352 return addr >= n->pmr.cba && addr < hi; 353 } 354 355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) 356 { 357 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); 358 } 359 360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) 361 { 362 hwaddr hi = addr + size - 1; 363 if (hi < addr) { 364 return 1; 365 } 366 367 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { 368 memcpy(buf, nvme_addr_to_cmb(n, addr), size); 369 return 0; 370 } 371 372 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { 373 memcpy(buf, nvme_addr_to_pmr(n, addr), size); 374 return 0; 375 } 376 377 return pci_dma_read(&n->parent_obj, addr, buf, size); 378 } 379 380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size) 381 { 382 hwaddr hi = addr + size - 1; 383 if (hi < addr) { 384 return 1; 385 } 386 387 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { 388 memcpy(nvme_addr_to_cmb(n, addr), buf, size); 389 return 0; 390 } 391 392 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { 393 memcpy(nvme_addr_to_pmr(n, addr), buf, size); 394 return 0; 395 } 396 397 return pci_dma_write(&n->parent_obj, addr, buf, size); 398 } 399 400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid) 401 { 402 return nsid && 403 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES); 404 } 405 406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid) 407 { 408 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1; 409 } 410 411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid) 412 { 413 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 
0 : -1; 414 } 415 416 static void nvme_inc_cq_tail(NvmeCQueue *cq) 417 { 418 cq->tail++; 419 if (cq->tail >= cq->size) { 420 cq->tail = 0; 421 cq->phase = !cq->phase; 422 } 423 } 424 425 static void nvme_inc_sq_head(NvmeSQueue *sq) 426 { 427 sq->head = (sq->head + 1) % sq->size; 428 } 429 430 static uint8_t nvme_cq_full(NvmeCQueue *cq) 431 { 432 return (cq->tail + 1) % cq->size == cq->head; 433 } 434 435 static uint8_t nvme_sq_empty(NvmeSQueue *sq) 436 { 437 return sq->head == sq->tail; 438 } 439 440 static void nvme_irq_check(NvmeCtrl *n) 441 { 442 uint32_t intms = ldl_le_p(&n->bar.intms); 443 444 if (msix_enabled(&(n->parent_obj))) { 445 return; 446 } 447 if (~intms & n->irq_status) { 448 pci_irq_assert(&n->parent_obj); 449 } else { 450 pci_irq_deassert(&n->parent_obj); 451 } 452 } 453 454 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) 455 { 456 if (cq->irq_enabled) { 457 if (msix_enabled(&(n->parent_obj))) { 458 trace_pci_nvme_irq_msix(cq->vector); 459 msix_notify(&(n->parent_obj), cq->vector); 460 } else { 461 trace_pci_nvme_irq_pin(); 462 assert(cq->vector < 32); 463 n->irq_status |= 1 << cq->vector; 464 nvme_irq_check(n); 465 } 466 } else { 467 trace_pci_nvme_irq_masked(); 468 } 469 } 470 471 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) 472 { 473 if (cq->irq_enabled) { 474 if (msix_enabled(&(n->parent_obj))) { 475 return; 476 } else { 477 assert(cq->vector < 32); 478 if (!n->cq_pending) { 479 n->irq_status &= ~(1 << cq->vector); 480 } 481 nvme_irq_check(n); 482 } 483 } 484 } 485 486 static void nvme_req_clear(NvmeRequest *req) 487 { 488 req->ns = NULL; 489 req->opaque = NULL; 490 req->aiocb = NULL; 491 memset(&req->cqe, 0x0, sizeof(req->cqe)); 492 req->status = NVME_SUCCESS; 493 } 494 495 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma) 496 { 497 if (dma) { 498 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0); 499 sg->flags = NVME_SG_DMA; 500 } else { 501 qemu_iovec_init(&sg->iov, 0); 502 } 503 504 sg->flags |= NVME_SG_ALLOC; 505 } 506 507 static inline void nvme_sg_unmap(NvmeSg *sg) 508 { 509 if (!(sg->flags & NVME_SG_ALLOC)) { 510 return; 511 } 512 513 if (sg->flags & NVME_SG_DMA) { 514 qemu_sglist_destroy(&sg->qsg); 515 } else { 516 qemu_iovec_destroy(&sg->iov); 517 } 518 519 memset(sg, 0x0, sizeof(*sg)); 520 } 521 522 /* 523 * When metadata is transfered as extended LBAs, the DPTR mapped into `sg` 524 * holds both data and metadata. This function splits the data and metadata 525 * into two separate QSG/IOVs. 526 */ 527 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data, 528 NvmeSg *mdata) 529 { 530 NvmeSg *dst = data; 531 uint32_t trans_len, count = ns->lbasz; 532 uint64_t offset = 0; 533 bool dma = sg->flags & NVME_SG_DMA; 534 size_t sge_len; 535 size_t sg_len = dma ? sg->qsg.size : sg->iov.size; 536 int sg_idx = 0; 537 538 assert(sg->flags & NVME_SG_ALLOC); 539 540 while (sg_len) { 541 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len; 542 543 trans_len = MIN(sg_len, count); 544 trans_len = MIN(trans_len, sge_len - offset); 545 546 if (dst) { 547 if (dma) { 548 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset, 549 trans_len); 550 } else { 551 qemu_iovec_add(&dst->iov, 552 sg->iov.iov[sg_idx].iov_base + offset, 553 trans_len); 554 } 555 } 556 557 sg_len -= trans_len; 558 count -= trans_len; 559 offset += trans_len; 560 561 if (count == 0) { 562 dst = (dst == data) ? mdata : data; 563 count = (dst == data) ? 
ns->lbasz : ns->lbaf.ms; 564 } 565 566 if (sge_len == offset) { 567 offset = 0; 568 sg_idx++; 569 } 570 } 571 } 572 573 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, 574 size_t len) 575 { 576 if (!len) { 577 return NVME_SUCCESS; 578 } 579 580 trace_pci_nvme_map_addr_cmb(addr, len); 581 582 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) { 583 return NVME_DATA_TRAS_ERROR; 584 } 585 586 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len); 587 588 return NVME_SUCCESS; 589 } 590 591 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, 592 size_t len) 593 { 594 if (!len) { 595 return NVME_SUCCESS; 596 } 597 598 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) { 599 return NVME_DATA_TRAS_ERROR; 600 } 601 602 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len); 603 604 return NVME_SUCCESS; 605 } 606 607 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len) 608 { 609 bool cmb = false, pmr = false; 610 611 if (!len) { 612 return NVME_SUCCESS; 613 } 614 615 trace_pci_nvme_map_addr(addr, len); 616 617 if (nvme_addr_is_cmb(n, addr)) { 618 cmb = true; 619 } else if (nvme_addr_is_pmr(n, addr)) { 620 pmr = true; 621 } 622 623 if (cmb || pmr) { 624 if (sg->flags & NVME_SG_DMA) { 625 return NVME_INVALID_USE_OF_CMB | NVME_DNR; 626 } 627 628 if (sg->iov.niov + 1 > IOV_MAX) { 629 goto max_mappings_exceeded; 630 } 631 632 if (cmb) { 633 return nvme_map_addr_cmb(n, &sg->iov, addr, len); 634 } else { 635 return nvme_map_addr_pmr(n, &sg->iov, addr, len); 636 } 637 } 638 639 if (!(sg->flags & NVME_SG_DMA)) { 640 return NVME_INVALID_USE_OF_CMB | NVME_DNR; 641 } 642 643 if (sg->qsg.nsg + 1 > IOV_MAX) { 644 goto max_mappings_exceeded; 645 } 646 647 qemu_sglist_add(&sg->qsg, addr, len); 648 649 return NVME_SUCCESS; 650 651 max_mappings_exceeded: 652 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings, 653 "number of mappings exceed 1024"); 654 return NVME_INTERNAL_DEV_ERROR | NVME_DNR; 655 } 656 657 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr) 658 { 659 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr)); 660 } 661 662 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, 663 uint64_t prp2, uint32_t len) 664 { 665 hwaddr trans_len = n->page_size - (prp1 % n->page_size); 666 trans_len = MIN(len, trans_len); 667 int num_prps = (len >> n->page_bits) + 1; 668 uint16_t status; 669 int ret; 670 671 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); 672 673 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1)); 674 675 status = nvme_map_addr(n, sg, prp1, trans_len); 676 if (status) { 677 goto unmap; 678 } 679 680 len -= trans_len; 681 if (len) { 682 if (len > n->page_size) { 683 uint64_t prp_list[n->max_prp_ents]; 684 uint32_t nents, prp_trans; 685 int i = 0; 686 687 /* 688 * The first PRP list entry, pointed to by PRP2 may contain offset. 689 * Hence, we need to calculate the number of entries in based on 690 * that offset. 
691 */ 692 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3; 693 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); 694 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); 695 if (ret) { 696 trace_pci_nvme_err_addr_read(prp2); 697 status = NVME_DATA_TRAS_ERROR; 698 goto unmap; 699 } 700 while (len != 0) { 701 uint64_t prp_ent = le64_to_cpu(prp_list[i]); 702 703 if (i == nents - 1 && len > n->page_size) { 704 if (unlikely(prp_ent & (n->page_size - 1))) { 705 trace_pci_nvme_err_invalid_prplist_ent(prp_ent); 706 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 707 goto unmap; 708 } 709 710 i = 0; 711 nents = (len + n->page_size - 1) >> n->page_bits; 712 nents = MIN(nents, n->max_prp_ents); 713 prp_trans = nents * sizeof(uint64_t); 714 ret = nvme_addr_read(n, prp_ent, (void *)prp_list, 715 prp_trans); 716 if (ret) { 717 trace_pci_nvme_err_addr_read(prp_ent); 718 status = NVME_DATA_TRAS_ERROR; 719 goto unmap; 720 } 721 prp_ent = le64_to_cpu(prp_list[i]); 722 } 723 724 if (unlikely(prp_ent & (n->page_size - 1))) { 725 trace_pci_nvme_err_invalid_prplist_ent(prp_ent); 726 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 727 goto unmap; 728 } 729 730 trans_len = MIN(len, n->page_size); 731 status = nvme_map_addr(n, sg, prp_ent, trans_len); 732 if (status) { 733 goto unmap; 734 } 735 736 len -= trans_len; 737 i++; 738 } 739 } else { 740 if (unlikely(prp2 & (n->page_size - 1))) { 741 trace_pci_nvme_err_invalid_prp2_align(prp2); 742 status = NVME_INVALID_PRP_OFFSET | NVME_DNR; 743 goto unmap; 744 } 745 status = nvme_map_addr(n, sg, prp2, len); 746 if (status) { 747 goto unmap; 748 } 749 } 750 } 751 752 return NVME_SUCCESS; 753 754 unmap: 755 nvme_sg_unmap(sg); 756 return status; 757 } 758 759 /* 760 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the 761 * number of bytes mapped in len. 762 */ 763 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, 764 NvmeSglDescriptor *segment, uint64_t nsgld, 765 size_t *len, NvmeCmd *cmd) 766 { 767 dma_addr_t addr, trans_len; 768 uint32_t dlen; 769 uint16_t status; 770 771 for (int i = 0; i < nsgld; i++) { 772 uint8_t type = NVME_SGL_TYPE(segment[i].type); 773 774 switch (type) { 775 case NVME_SGL_DESCR_TYPE_BIT_BUCKET: 776 if (cmd->opcode == NVME_CMD_WRITE) { 777 continue; 778 } 779 case NVME_SGL_DESCR_TYPE_DATA_BLOCK: 780 break; 781 case NVME_SGL_DESCR_TYPE_SEGMENT: 782 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: 783 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR; 784 default: 785 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR; 786 } 787 788 dlen = le32_to_cpu(segment[i].len); 789 790 if (!dlen) { 791 continue; 792 } 793 794 if (*len == 0) { 795 /* 796 * All data has been mapped, but the SGL contains additional 797 * segments and/or descriptors. The controller might accept 798 * ignoring the rest of the SGL. 
799 */ 800 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls); 801 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) { 802 break; 803 } 804 805 trace_pci_nvme_err_invalid_sgl_excess_length(dlen); 806 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 807 } 808 809 trans_len = MIN(*len, dlen); 810 811 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) { 812 goto next; 813 } 814 815 addr = le64_to_cpu(segment[i].addr); 816 817 if (UINT64_MAX - addr < dlen) { 818 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 819 } 820 821 status = nvme_map_addr(n, sg, addr, trans_len); 822 if (status) { 823 return status; 824 } 825 826 next: 827 *len -= trans_len; 828 } 829 830 return NVME_SUCCESS; 831 } 832 833 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, 834 size_t len, NvmeCmd *cmd) 835 { 836 /* 837 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid 838 * dynamically allocating a potentially huge SGL. The spec allows the SGL 839 * to be larger (as in number of bytes required to describe the SGL 840 * descriptors and segment chain) than the command transfer size, so it is 841 * not bounded by MDTS. 842 */ 843 const int SEG_CHUNK_SIZE = 256; 844 845 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; 846 uint64_t nsgld; 847 uint32_t seg_len; 848 uint16_t status; 849 hwaddr addr; 850 int ret; 851 852 sgld = &sgl; 853 addr = le64_to_cpu(sgl.addr); 854 855 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len); 856 857 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr)); 858 859 /* 860 * If the entire transfer can be described with a single data block it can 861 * be mapped directly. 862 */ 863 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { 864 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd); 865 if (status) { 866 goto unmap; 867 } 868 869 goto out; 870 } 871 872 for (;;) { 873 switch (NVME_SGL_TYPE(sgld->type)) { 874 case NVME_SGL_DESCR_TYPE_SEGMENT: 875 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: 876 break; 877 default: 878 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 879 } 880 881 seg_len = le32_to_cpu(sgld->len); 882 883 /* check the length of the (Last) Segment descriptor */ 884 if ((!seg_len || seg_len & 0xf) && 885 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) { 886 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 887 } 888 889 if (UINT64_MAX - addr < seg_len) { 890 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 891 } 892 893 nsgld = seg_len / sizeof(NvmeSglDescriptor); 894 895 while (nsgld > SEG_CHUNK_SIZE) { 896 if (nvme_addr_read(n, addr, segment, sizeof(segment))) { 897 trace_pci_nvme_err_addr_read(addr); 898 status = NVME_DATA_TRAS_ERROR; 899 goto unmap; 900 } 901 902 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE, 903 &len, cmd); 904 if (status) { 905 goto unmap; 906 } 907 908 nsgld -= SEG_CHUNK_SIZE; 909 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor); 910 } 911 912 ret = nvme_addr_read(n, addr, segment, nsgld * 913 sizeof(NvmeSglDescriptor)); 914 if (ret) { 915 trace_pci_nvme_err_addr_read(addr); 916 status = NVME_DATA_TRAS_ERROR; 917 goto unmap; 918 } 919 920 last_sgld = &segment[nsgld - 1]; 921 922 /* 923 * If the segment ends with a Data Block or Bit Bucket Descriptor Type, 924 * then we are done. 
925 */ 926 switch (NVME_SGL_TYPE(last_sgld->type)) { 927 case NVME_SGL_DESCR_TYPE_DATA_BLOCK: 928 case NVME_SGL_DESCR_TYPE_BIT_BUCKET: 929 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd); 930 if (status) { 931 goto unmap; 932 } 933 934 goto out; 935 936 default: 937 break; 938 } 939 940 /* 941 * If the last descriptor was not a Data Block or Bit Bucket, then the 942 * current segment must not be a Last Segment. 943 */ 944 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) { 945 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; 946 goto unmap; 947 } 948 949 sgld = last_sgld; 950 addr = le64_to_cpu(sgld->addr); 951 952 /* 953 * Do not map the last descriptor; it will be a Segment or Last Segment 954 * descriptor and is handled by the next iteration. 955 */ 956 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd); 957 if (status) { 958 goto unmap; 959 } 960 } 961 962 out: 963 /* if there is any residual left in len, the SGL was too short */ 964 if (len) { 965 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR; 966 goto unmap; 967 } 968 969 return NVME_SUCCESS; 970 971 unmap: 972 nvme_sg_unmap(sg); 973 return status; 974 } 975 976 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, 977 NvmeCmd *cmd) 978 { 979 uint64_t prp1, prp2; 980 981 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) { 982 case NVME_PSDT_PRP: 983 prp1 = le64_to_cpu(cmd->dptr.prp1); 984 prp2 = le64_to_cpu(cmd->dptr.prp2); 985 986 return nvme_map_prp(n, sg, prp1, prp2, len); 987 case NVME_PSDT_SGL_MPTR_CONTIGUOUS: 988 case NVME_PSDT_SGL_MPTR_SGL: 989 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd); 990 default: 991 return NVME_INVALID_FIELD; 992 } 993 } 994 995 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len, 996 NvmeCmd *cmd) 997 { 998 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags); 999 hwaddr mptr = le64_to_cpu(cmd->mptr); 1000 uint16_t status; 1001 1002 if (psdt == NVME_PSDT_SGL_MPTR_SGL) { 1003 NvmeSglDescriptor sgl; 1004 1005 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) { 1006 return NVME_DATA_TRAS_ERROR; 1007 } 1008 1009 status = nvme_map_sgl(n, sg, sgl, len, cmd); 1010 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) { 1011 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR; 1012 } 1013 1014 return status; 1015 } 1016 1017 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr)); 1018 status = nvme_map_addr(n, sg, mptr, len); 1019 if (status) { 1020 nvme_sg_unmap(sg); 1021 } 1022 1023 return status; 1024 } 1025 1026 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) 1027 { 1028 NvmeNamespace *ns = req->ns; 1029 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1030 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps); 1031 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT); 1032 size_t len = nvme_l2b(ns, nlb); 1033 uint16_t status; 1034 1035 if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) { 1036 NvmeSg sg; 1037 1038 len += nvme_m2b(ns, nlb); 1039 1040 status = nvme_map_dptr(n, &sg, len, &req->cmd); 1041 if (status) { 1042 return status; 1043 } 1044 1045 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA); 1046 nvme_sg_split(&sg, ns, &req->sg, NULL); 1047 nvme_sg_unmap(&sg); 1048 1049 return NVME_SUCCESS; 1050 } 1051 1052 return nvme_map_dptr(n, &req->sg, len, &req->cmd); 1053 } 1054 1055 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) 1056 { 1057 NvmeNamespace *ns = req->ns; 1058 size_t len = nvme_m2b(ns, nlb); 1059 uint16_t status; 1060 1061 if (nvme_ns_ext(ns)) { 1062 NvmeSg sg; 1063 1064 len += nvme_l2b(ns, 
nlb); 1065 1066 status = nvme_map_dptr(n, &sg, len, &req->cmd); 1067 if (status) { 1068 return status; 1069 } 1070 1071 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA); 1072 nvme_sg_split(&sg, ns, NULL, &req->sg); 1073 nvme_sg_unmap(&sg); 1074 1075 return NVME_SUCCESS; 1076 } 1077 1078 return nvme_map_mptr(n, &req->sg, len, &req->cmd); 1079 } 1080 1081 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, 1082 uint32_t len, uint32_t bytes, 1083 int32_t skip_bytes, int64_t offset, 1084 NvmeTxDirection dir) 1085 { 1086 hwaddr addr; 1087 uint32_t trans_len, count = bytes; 1088 bool dma = sg->flags & NVME_SG_DMA; 1089 int64_t sge_len; 1090 int sg_idx = 0; 1091 int ret; 1092 1093 assert(sg->flags & NVME_SG_ALLOC); 1094 1095 while (len) { 1096 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len; 1097 1098 if (sge_len - offset < 0) { 1099 offset -= sge_len; 1100 sg_idx++; 1101 continue; 1102 } 1103 1104 if (sge_len == offset) { 1105 offset = 0; 1106 sg_idx++; 1107 continue; 1108 } 1109 1110 trans_len = MIN(len, count); 1111 trans_len = MIN(trans_len, sge_len - offset); 1112 1113 if (dma) { 1114 addr = sg->qsg.sg[sg_idx].base + offset; 1115 } else { 1116 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset; 1117 } 1118 1119 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1120 ret = nvme_addr_read(n, addr, ptr, trans_len); 1121 } else { 1122 ret = nvme_addr_write(n, addr, ptr, trans_len); 1123 } 1124 1125 if (ret) { 1126 return NVME_DATA_TRAS_ERROR; 1127 } 1128 1129 ptr += trans_len; 1130 len -= trans_len; 1131 count -= trans_len; 1132 offset += trans_len; 1133 1134 if (count == 0) { 1135 count = bytes; 1136 offset += skip_bytes; 1137 } 1138 } 1139 1140 return NVME_SUCCESS; 1141 } 1142 1143 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, 1144 NvmeTxDirection dir) 1145 { 1146 assert(sg->flags & NVME_SG_ALLOC); 1147 1148 if (sg->flags & NVME_SG_DMA) { 1149 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED; 1150 uint64_t residual; 1151 1152 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1153 residual = dma_buf_write(ptr, len, &sg->qsg, attrs); 1154 } else { 1155 residual = dma_buf_read(ptr, len, &sg->qsg, attrs); 1156 } 1157 1158 if (unlikely(residual)) { 1159 trace_pci_nvme_err_invalid_dma(); 1160 return NVME_INVALID_FIELD | NVME_DNR; 1161 } 1162 } else { 1163 size_t bytes; 1164 1165 if (dir == NVME_TX_DIRECTION_TO_DEVICE) { 1166 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len); 1167 } else { 1168 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len); 1169 } 1170 1171 if (unlikely(bytes != len)) { 1172 trace_pci_nvme_err_invalid_dma(); 1173 return NVME_INVALID_FIELD | NVME_DNR; 1174 } 1175 } 1176 1177 return NVME_SUCCESS; 1178 } 1179 1180 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1181 NvmeRequest *req) 1182 { 1183 uint16_t status; 1184 1185 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 1186 if (status) { 1187 return status; 1188 } 1189 1190 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE); 1191 } 1192 1193 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1194 NvmeRequest *req) 1195 { 1196 uint16_t status; 1197 1198 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 1199 if (status) { 1200 return status; 1201 } 1202 1203 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE); 1204 } 1205 1206 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1207 NvmeTxDirection dir, NvmeRequest *req) 1208 { 1209 NvmeNamespace *ns = req->ns; 
1210 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1211 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps); 1212 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT); 1213 1214 if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) { 1215 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz, 1216 ns->lbaf.ms, 0, dir); 1217 } 1218 1219 return nvme_tx(n, &req->sg, ptr, len, dir); 1220 } 1221 1222 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len, 1223 NvmeTxDirection dir, NvmeRequest *req) 1224 { 1225 NvmeNamespace *ns = req->ns; 1226 uint16_t status; 1227 1228 if (nvme_ns_ext(ns)) { 1229 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms, 1230 ns->lbasz, ns->lbasz, dir); 1231 } 1232 1233 nvme_sg_unmap(&req->sg); 1234 1235 status = nvme_map_mptr(n, &req->sg, len, &req->cmd); 1236 if (status) { 1237 return status; 1238 } 1239 1240 return nvme_tx(n, &req->sg, ptr, len, dir); 1241 } 1242 1243 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, 1244 BlockCompletionFunc *cb, NvmeRequest *req) 1245 { 1246 assert(req->sg.flags & NVME_SG_ALLOC); 1247 1248 if (req->sg.flags & NVME_SG_DMA) { 1249 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, 1250 cb, req); 1251 } else { 1252 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req); 1253 } 1254 } 1255 1256 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, 1257 BlockCompletionFunc *cb, NvmeRequest *req) 1258 { 1259 assert(req->sg.flags & NVME_SG_ALLOC); 1260 1261 if (req->sg.flags & NVME_SG_DMA) { 1262 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, 1263 cb, req); 1264 } else { 1265 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req); 1266 } 1267 } 1268 1269 static void nvme_post_cqes(void *opaque) 1270 { 1271 NvmeCQueue *cq = opaque; 1272 NvmeCtrl *n = cq->ctrl; 1273 NvmeRequest *req, *next; 1274 bool pending = cq->head != cq->tail; 1275 int ret; 1276 1277 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) { 1278 NvmeSQueue *sq; 1279 hwaddr addr; 1280 1281 if (nvme_cq_full(cq)) { 1282 break; 1283 } 1284 1285 sq = req->sq; 1286 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase); 1287 req->cqe.sq_id = cpu_to_le16(sq->sqid); 1288 req->cqe.sq_head = cpu_to_le16(sq->head); 1289 addr = cq->dma_addr + cq->tail * n->cqe_size; 1290 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe, 1291 sizeof(req->cqe)); 1292 if (ret) { 1293 trace_pci_nvme_err_addr_write(addr); 1294 trace_pci_nvme_err_cfs(); 1295 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); 1296 break; 1297 } 1298 QTAILQ_REMOVE(&cq->req_list, req, entry); 1299 nvme_inc_cq_tail(cq); 1300 nvme_sg_unmap(&req->sg); 1301 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); 1302 } 1303 if (cq->tail != cq->head) { 1304 if (cq->irq_enabled && !pending) { 1305 n->cq_pending++; 1306 } 1307 1308 nvme_irq_assert(n, cq); 1309 } 1310 } 1311 1312 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) 1313 { 1314 assert(cq->cqid == req->sq->cqid); 1315 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid, 1316 le32_to_cpu(req->cqe.result), 1317 le32_to_cpu(req->cqe.dw1), 1318 req->status); 1319 1320 if (req->status) { 1321 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns), 1322 req->status, req->cmd.opcode); 1323 } 1324 1325 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); 1326 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); 1327 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 1328 } 1329 1330 static 
void nvme_process_aers(void *opaque) 1331 { 1332 NvmeCtrl *n = opaque; 1333 NvmeAsyncEvent *event, *next; 1334 1335 trace_pci_nvme_process_aers(n->aer_queued); 1336 1337 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) { 1338 NvmeRequest *req; 1339 NvmeAerResult *result; 1340 1341 /* can't post cqe if there is nothing to complete */ 1342 if (!n->outstanding_aers) { 1343 trace_pci_nvme_no_outstanding_aers(); 1344 break; 1345 } 1346 1347 /* ignore if masked (cqe posted, but event not cleared) */ 1348 if (n->aer_mask & (1 << event->result.event_type)) { 1349 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask); 1350 continue; 1351 } 1352 1353 QTAILQ_REMOVE(&n->aer_queue, event, entry); 1354 n->aer_queued--; 1355 1356 n->aer_mask |= 1 << event->result.event_type; 1357 n->outstanding_aers--; 1358 1359 req = n->aer_reqs[n->outstanding_aers]; 1360 1361 result = (NvmeAerResult *) &req->cqe.result; 1362 result->event_type = event->result.event_type; 1363 result->event_info = event->result.event_info; 1364 result->log_page = event->result.log_page; 1365 g_free(event); 1366 1367 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info, 1368 result->log_page); 1369 1370 nvme_enqueue_req_completion(&n->admin_cq, req); 1371 } 1372 } 1373 1374 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type, 1375 uint8_t event_info, uint8_t log_page) 1376 { 1377 NvmeAsyncEvent *event; 1378 1379 trace_pci_nvme_enqueue_event(event_type, event_info, log_page); 1380 1381 if (n->aer_queued == n->params.aer_max_queued) { 1382 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued); 1383 return; 1384 } 1385 1386 event = g_new(NvmeAsyncEvent, 1); 1387 event->result = (NvmeAerResult) { 1388 .event_type = event_type, 1389 .event_info = event_info, 1390 .log_page = log_page, 1391 }; 1392 1393 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry); 1394 n->aer_queued++; 1395 1396 nvme_process_aers(n); 1397 } 1398 1399 static void nvme_smart_event(NvmeCtrl *n, uint8_t event) 1400 { 1401 uint8_t aer_info; 1402 1403 /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */ 1404 if (!(NVME_AEC_SMART(n->features.async_config) & event)) { 1405 return; 1406 } 1407 1408 switch (event) { 1409 case NVME_SMART_SPARE: 1410 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH; 1411 break; 1412 case NVME_SMART_TEMPERATURE: 1413 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH; 1414 break; 1415 case NVME_SMART_RELIABILITY: 1416 case NVME_SMART_MEDIA_READ_ONLY: 1417 case NVME_SMART_FAILED_VOLATILE_MEDIA: 1418 case NVME_SMART_PMR_UNRELIABLE: 1419 aer_info = NVME_AER_INFO_SMART_RELIABILITY; 1420 break; 1421 default: 1422 return; 1423 } 1424 1425 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO); 1426 } 1427 1428 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) 1429 { 1430 n->aer_mask &= ~(1 << event_type); 1431 if (!QTAILQ_EMPTY(&n->aer_queue)) { 1432 nvme_process_aers(n); 1433 } 1434 } 1435 1436 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) 1437 { 1438 uint8_t mdts = n->params.mdts; 1439 1440 if (mdts && len > n->page_size << mdts) { 1441 trace_pci_nvme_err_mdts(len); 1442 return NVME_INVALID_FIELD | NVME_DNR; 1443 } 1444 1445 return NVME_SUCCESS; 1446 } 1447 1448 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, 1449 uint32_t nlb) 1450 { 1451 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze); 1452 1453 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) { 1454 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze); 1455 return 
NVME_LBA_RANGE | NVME_DNR; 1456 } 1457 1458 return NVME_SUCCESS; 1459 } 1460 1461 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba, 1462 uint32_t nlb, int flags) 1463 { 1464 BlockDriverState *bs = blk_bs(ns->blkconf.blk); 1465 1466 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb); 1467 int64_t offset = nvme_l2b(ns, slba); 1468 int ret; 1469 1470 /* 1471 * `pnum` holds the number of bytes after offset that shares the same 1472 * allocation status as the byte at offset. If `pnum` is different from 1473 * `bytes`, we should check the allocation status of the next range and 1474 * continue this until all bytes have been checked. 1475 */ 1476 do { 1477 bytes -= pnum; 1478 1479 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL); 1480 if (ret < 0) { 1481 return ret; 1482 } 1483 1484 1485 trace_pci_nvme_block_status(offset, bytes, pnum, ret, 1486 !!(ret & BDRV_BLOCK_ZERO)); 1487 1488 if (!(ret & flags)) { 1489 return 1; 1490 } 1491 1492 offset += pnum; 1493 } while (pnum != bytes); 1494 1495 return 0; 1496 } 1497 1498 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, 1499 uint32_t nlb) 1500 { 1501 int ret; 1502 Error *err = NULL; 1503 1504 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA); 1505 if (ret) { 1506 if (ret < 0) { 1507 error_setg_errno(&err, -ret, "unable to get block status"); 1508 error_report_err(err); 1509 1510 return NVME_INTERNAL_DEV_ERROR; 1511 } 1512 1513 return NVME_DULB; 1514 } 1515 1516 return NVME_SUCCESS; 1517 } 1518 1519 static void nvme_aio_err(NvmeRequest *req, int ret) 1520 { 1521 uint16_t status = NVME_SUCCESS; 1522 Error *local_err = NULL; 1523 1524 switch (req->cmd.opcode) { 1525 case NVME_CMD_READ: 1526 status = NVME_UNRECOVERED_READ; 1527 break; 1528 case NVME_CMD_FLUSH: 1529 case NVME_CMD_WRITE: 1530 case NVME_CMD_WRITE_ZEROES: 1531 case NVME_CMD_ZONE_APPEND: 1532 status = NVME_WRITE_FAULT; 1533 break; 1534 default: 1535 status = NVME_INTERNAL_DEV_ERROR; 1536 break; 1537 } 1538 1539 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); 1540 1541 error_setg_errno(&local_err, -ret, "aio failed"); 1542 error_report_err(local_err); 1543 1544 /* 1545 * Set the command status code to the first encountered error but allow a 1546 * subsequent Internal Device Error to trump it. 1547 */ 1548 if (req->status && status != NVME_INTERNAL_DEV_ERROR) { 1549 return; 1550 } 1551 1552 req->status = status; 1553 } 1554 1555 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba) 1556 { 1557 return ns->zone_size_log2 > 0 ? 
slba >> ns->zone_size_log2 : 1558 slba / ns->zone_size; 1559 } 1560 1561 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) 1562 { 1563 uint32_t zone_idx = nvme_zone_idx(ns, slba); 1564 1565 if (zone_idx >= ns->num_zones) { 1566 return NULL; 1567 } 1568 1569 return &ns->zone_array[zone_idx]; 1570 } 1571 1572 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) 1573 { 1574 uint64_t zslba = zone->d.zslba; 1575 1576 switch (nvme_get_zone_state(zone)) { 1577 case NVME_ZONE_STATE_EMPTY: 1578 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1579 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1580 case NVME_ZONE_STATE_CLOSED: 1581 return NVME_SUCCESS; 1582 case NVME_ZONE_STATE_FULL: 1583 trace_pci_nvme_err_zone_is_full(zslba); 1584 return NVME_ZONE_FULL; 1585 case NVME_ZONE_STATE_OFFLINE: 1586 trace_pci_nvme_err_zone_is_offline(zslba); 1587 return NVME_ZONE_OFFLINE; 1588 case NVME_ZONE_STATE_READ_ONLY: 1589 trace_pci_nvme_err_zone_is_read_only(zslba); 1590 return NVME_ZONE_READ_ONLY; 1591 default: 1592 assert(false); 1593 } 1594 1595 return NVME_INTERNAL_DEV_ERROR; 1596 } 1597 1598 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, 1599 uint64_t slba, uint32_t nlb) 1600 { 1601 uint64_t zcap = nvme_zone_wr_boundary(zone); 1602 uint16_t status; 1603 1604 status = nvme_check_zone_state_for_write(zone); 1605 if (status) { 1606 return status; 1607 } 1608 1609 if (unlikely(slba != zone->w_ptr)) { 1610 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); 1611 return NVME_ZONE_INVALID_WRITE; 1612 } 1613 1614 if (unlikely((slba + nlb) > zcap)) { 1615 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap); 1616 return NVME_ZONE_BOUNDARY_ERROR; 1617 } 1618 1619 return NVME_SUCCESS; 1620 } 1621 1622 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) 1623 { 1624 switch (nvme_get_zone_state(zone)) { 1625 case NVME_ZONE_STATE_EMPTY: 1626 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1627 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1628 case NVME_ZONE_STATE_FULL: 1629 case NVME_ZONE_STATE_CLOSED: 1630 case NVME_ZONE_STATE_READ_ONLY: 1631 return NVME_SUCCESS; 1632 case NVME_ZONE_STATE_OFFLINE: 1633 trace_pci_nvme_err_zone_is_offline(zone->d.zslba); 1634 return NVME_ZONE_OFFLINE; 1635 default: 1636 assert(false); 1637 } 1638 1639 return NVME_INTERNAL_DEV_ERROR; 1640 } 1641 1642 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, 1643 uint32_t nlb) 1644 { 1645 NvmeZone *zone; 1646 uint64_t bndry, end; 1647 uint16_t status; 1648 1649 zone = nvme_get_zone_by_slba(ns, slba); 1650 assert(zone); 1651 1652 bndry = nvme_zone_rd_boundary(ns, zone); 1653 end = slba + nlb; 1654 1655 status = nvme_check_zone_state_for_read(zone); 1656 if (status) { 1657 ; 1658 } else if (unlikely(end > bndry)) { 1659 if (!ns->params.cross_zone_read) { 1660 status = NVME_ZONE_BOUNDARY_ERROR; 1661 } else { 1662 /* 1663 * Read across zone boundary - check that all subsequent 1664 * zones that are being read have an appropriate state. 
1665 */ 1666 do { 1667 zone++; 1668 status = nvme_check_zone_state_for_read(zone); 1669 if (status) { 1670 break; 1671 } 1672 } while (end > nvme_zone_rd_boundary(ns, zone)); 1673 } 1674 } 1675 1676 return status; 1677 } 1678 1679 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) 1680 { 1681 switch (nvme_get_zone_state(zone)) { 1682 case NVME_ZONE_STATE_FULL: 1683 return NVME_SUCCESS; 1684 1685 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1686 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1687 nvme_aor_dec_open(ns); 1688 /* fallthrough */ 1689 case NVME_ZONE_STATE_CLOSED: 1690 nvme_aor_dec_active(ns); 1691 /* fallthrough */ 1692 case NVME_ZONE_STATE_EMPTY: 1693 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); 1694 return NVME_SUCCESS; 1695 1696 default: 1697 return NVME_ZONE_INVAL_TRANSITION; 1698 } 1699 } 1700 1701 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) 1702 { 1703 switch (nvme_get_zone_state(zone)) { 1704 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1705 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1706 nvme_aor_dec_open(ns); 1707 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 1708 /* fall through */ 1709 case NVME_ZONE_STATE_CLOSED: 1710 return NVME_SUCCESS; 1711 1712 default: 1713 return NVME_ZONE_INVAL_TRANSITION; 1714 } 1715 } 1716 1717 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone) 1718 { 1719 switch (nvme_get_zone_state(zone)) { 1720 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1721 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1722 nvme_aor_dec_open(ns); 1723 /* fallthrough */ 1724 case NVME_ZONE_STATE_CLOSED: 1725 nvme_aor_dec_active(ns); 1726 /* fallthrough */ 1727 case NVME_ZONE_STATE_FULL: 1728 zone->w_ptr = zone->d.zslba; 1729 zone->d.wp = zone->w_ptr; 1730 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); 1731 /* fallthrough */ 1732 case NVME_ZONE_STATE_EMPTY: 1733 return NVME_SUCCESS; 1734 1735 default: 1736 return NVME_ZONE_INVAL_TRANSITION; 1737 } 1738 } 1739 1740 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) 1741 { 1742 NvmeZone *zone; 1743 1744 if (ns->params.max_open_zones && 1745 ns->nr_open_zones == ns->params.max_open_zones) { 1746 zone = QTAILQ_FIRST(&ns->imp_open_zones); 1747 if (zone) { 1748 /* 1749 * Automatically close this implicitly open zone. 
1750 */ 1751 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 1752 nvme_zrm_close(ns, zone); 1753 } 1754 } 1755 } 1756 1757 enum { 1758 NVME_ZRM_AUTO = 1 << 0, 1759 }; 1760 1761 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns, 1762 NvmeZone *zone, int flags) 1763 { 1764 int act = 0; 1765 uint16_t status; 1766 1767 switch (nvme_get_zone_state(zone)) { 1768 case NVME_ZONE_STATE_EMPTY: 1769 act = 1; 1770 1771 /* fallthrough */ 1772 1773 case NVME_ZONE_STATE_CLOSED: 1774 if (n->params.auto_transition_zones) { 1775 nvme_zrm_auto_transition_zone(ns); 1776 } 1777 status = nvme_aor_check(ns, act, 1); 1778 if (status) { 1779 return status; 1780 } 1781 1782 if (act) { 1783 nvme_aor_inc_active(ns); 1784 } 1785 1786 nvme_aor_inc_open(ns); 1787 1788 if (flags & NVME_ZRM_AUTO) { 1789 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); 1790 return NVME_SUCCESS; 1791 } 1792 1793 /* fallthrough */ 1794 1795 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 1796 if (flags & NVME_ZRM_AUTO) { 1797 return NVME_SUCCESS; 1798 } 1799 1800 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); 1801 1802 /* fallthrough */ 1803 1804 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 1805 return NVME_SUCCESS; 1806 1807 default: 1808 return NVME_ZONE_INVAL_TRANSITION; 1809 } 1810 } 1811 1812 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns, 1813 NvmeZone *zone) 1814 { 1815 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO); 1816 } 1817 1818 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns, 1819 NvmeZone *zone) 1820 { 1821 return nvme_zrm_open_flags(n, ns, zone, 0); 1822 } 1823 1824 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, 1825 uint32_t nlb) 1826 { 1827 zone->d.wp += nlb; 1828 1829 if (zone->d.wp == nvme_zone_wr_boundary(zone)) { 1830 nvme_zrm_finish(ns, zone); 1831 } 1832 } 1833 1834 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) 1835 { 1836 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1837 NvmeZone *zone; 1838 uint64_t slba; 1839 uint32_t nlb; 1840 1841 slba = le64_to_cpu(rw->slba); 1842 nlb = le16_to_cpu(rw->nlb) + 1; 1843 zone = nvme_get_zone_by_slba(ns, slba); 1844 assert(zone); 1845 1846 nvme_advance_zone_wp(ns, zone, nlb); 1847 } 1848 1849 static inline bool nvme_is_write(NvmeRequest *req) 1850 { 1851 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1852 1853 return rw->opcode == NVME_CMD_WRITE || 1854 rw->opcode == NVME_CMD_ZONE_APPEND || 1855 rw->opcode == NVME_CMD_WRITE_ZEROES; 1856 } 1857 1858 static AioContext *nvme_get_aio_context(BlockAIOCB *acb) 1859 { 1860 return qemu_get_aio_context(); 1861 } 1862 1863 static void nvme_misc_cb(void *opaque, int ret) 1864 { 1865 NvmeRequest *req = opaque; 1866 1867 trace_pci_nvme_misc_cb(nvme_cid(req)); 1868 1869 if (ret) { 1870 nvme_aio_err(req, ret); 1871 } 1872 1873 nvme_enqueue_req_completion(nvme_cq(req), req); 1874 } 1875 1876 void nvme_rw_complete_cb(void *opaque, int ret) 1877 { 1878 NvmeRequest *req = opaque; 1879 NvmeNamespace *ns = req->ns; 1880 BlockBackend *blk = ns->blkconf.blk; 1881 BlockAcctCookie *acct = &req->acct; 1882 BlockAcctStats *stats = blk_get_stats(blk); 1883 1884 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk)); 1885 1886 if (ret) { 1887 block_acct_failed(stats, acct); 1888 nvme_aio_err(req, ret); 1889 } else { 1890 block_acct_done(stats, acct); 1891 } 1892 1893 if (ns->params.zoned && nvme_is_write(req)) { 1894 nvme_finalize_zoned_write(ns, req); 1895 } 1896 1897 nvme_enqueue_req_completion(nvme_cq(req), req); 1898 } 1899 1900 
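/*
 * Completion callback for the data part of a read/write. On success, if the
 * namespace is formatted with metadata, this either zeroes the metadata
 * region (for Write Zeroes) or issues a separate metadata transfer before
 * chaining to nvme_rw_complete_cb; otherwise it completes the request
 * directly via nvme_rw_complete_cb.
 */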
static void nvme_rw_cb(void *opaque, int ret) 1901 { 1902 NvmeRequest *req = opaque; 1903 NvmeNamespace *ns = req->ns; 1904 1905 BlockBackend *blk = ns->blkconf.blk; 1906 1907 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); 1908 1909 if (ret) { 1910 goto out; 1911 } 1912 1913 if (ns->lbaf.ms) { 1914 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1915 uint64_t slba = le64_to_cpu(rw->slba); 1916 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 1917 uint64_t offset = nvme_moff(ns, slba); 1918 1919 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) { 1920 size_t mlen = nvme_m2b(ns, nlb); 1921 1922 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen, 1923 BDRV_REQ_MAY_UNMAP, 1924 nvme_rw_complete_cb, req); 1925 return; 1926 } 1927 1928 if (nvme_ns_ext(ns) || req->cmd.mptr) { 1929 uint16_t status; 1930 1931 nvme_sg_unmap(&req->sg); 1932 status = nvme_map_mdata(nvme_ctrl(req), nlb, req); 1933 if (status) { 1934 ret = -EFAULT; 1935 goto out; 1936 } 1937 1938 if (req->cmd.opcode == NVME_CMD_READ) { 1939 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req); 1940 } 1941 1942 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req); 1943 } 1944 } 1945 1946 out: 1947 nvme_rw_complete_cb(req, ret); 1948 } 1949 1950 static void nvme_verify_cb(void *opaque, int ret) 1951 { 1952 NvmeBounceContext *ctx = opaque; 1953 NvmeRequest *req = ctx->req; 1954 NvmeNamespace *ns = req->ns; 1955 BlockBackend *blk = ns->blkconf.blk; 1956 BlockAcctCookie *acct = &req->acct; 1957 BlockAcctStats *stats = blk_get_stats(blk); 1958 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 1959 uint64_t slba = le64_to_cpu(rw->slba); 1960 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 1961 uint16_t apptag = le16_to_cpu(rw->apptag); 1962 uint16_t appmask = le16_to_cpu(rw->appmask); 1963 uint32_t reftag = le32_to_cpu(rw->reftag); 1964 uint16_t status; 1965 1966 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag); 1967 1968 if (ret) { 1969 block_acct_failed(stats, acct); 1970 nvme_aio_err(req, ret); 1971 goto out; 1972 } 1973 1974 block_acct_done(stats, acct); 1975 1976 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 1977 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce, 1978 ctx->mdata.iov.size, slba); 1979 if (status) { 1980 req->status = status; 1981 goto out; 1982 } 1983 1984 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 1985 ctx->mdata.bounce, ctx->mdata.iov.size, 1986 prinfo, slba, apptag, appmask, &reftag); 1987 } 1988 1989 out: 1990 qemu_iovec_destroy(&ctx->data.iov); 1991 g_free(ctx->data.bounce); 1992 1993 qemu_iovec_destroy(&ctx->mdata.iov); 1994 g_free(ctx->mdata.bounce); 1995 1996 g_free(ctx); 1997 1998 nvme_enqueue_req_completion(nvme_cq(req), req); 1999 } 2000 2001 2002 static void nvme_verify_mdata_in_cb(void *opaque, int ret) 2003 { 2004 NvmeBounceContext *ctx = opaque; 2005 NvmeRequest *req = ctx->req; 2006 NvmeNamespace *ns = req->ns; 2007 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2008 uint64_t slba = le64_to_cpu(rw->slba); 2009 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2010 size_t mlen = nvme_m2b(ns, nlb); 2011 uint64_t offset = nvme_moff(ns, slba); 2012 BlockBackend *blk = ns->blkconf.blk; 2013 2014 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk)); 2015 2016 if (ret) { 2017 goto out; 2018 } 2019 2020 ctx->mdata.bounce = g_malloc(mlen); 2021 2022 qemu_iovec_reset(&ctx->mdata.iov); 2023 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2024 2025 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2026 nvme_verify_cb, ctx); 2027 return; 2028 
2029 out: 2030 nvme_verify_cb(ctx, ret); 2031 } 2032 2033 struct nvme_compare_ctx { 2034 struct { 2035 QEMUIOVector iov; 2036 uint8_t *bounce; 2037 } data; 2038 2039 struct { 2040 QEMUIOVector iov; 2041 uint8_t *bounce; 2042 } mdata; 2043 }; 2044 2045 static void nvme_compare_mdata_cb(void *opaque, int ret) 2046 { 2047 NvmeRequest *req = opaque; 2048 NvmeNamespace *ns = req->ns; 2049 NvmeCtrl *n = nvme_ctrl(req); 2050 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2051 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 2052 uint16_t apptag = le16_to_cpu(rw->apptag); 2053 uint16_t appmask = le16_to_cpu(rw->appmask); 2054 uint32_t reftag = le32_to_cpu(rw->reftag); 2055 struct nvme_compare_ctx *ctx = req->opaque; 2056 g_autofree uint8_t *buf = NULL; 2057 BlockBackend *blk = ns->blkconf.blk; 2058 BlockAcctCookie *acct = &req->acct; 2059 BlockAcctStats *stats = blk_get_stats(blk); 2060 uint16_t status = NVME_SUCCESS; 2061 2062 trace_pci_nvme_compare_mdata_cb(nvme_cid(req)); 2063 2064 if (ret) { 2065 block_acct_failed(stats, acct); 2066 nvme_aio_err(req, ret); 2067 goto out; 2068 } 2069 2070 buf = g_malloc(ctx->mdata.iov.size); 2071 2072 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size, 2073 NVME_TX_DIRECTION_TO_DEVICE, req); 2074 if (status) { 2075 req->status = status; 2076 goto out; 2077 } 2078 2079 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2080 uint64_t slba = le64_to_cpu(rw->slba); 2081 uint8_t *bufp; 2082 uint8_t *mbufp = ctx->mdata.bounce; 2083 uint8_t *end = mbufp + ctx->mdata.iov.size; 2084 int16_t pil = 0; 2085 2086 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, 2087 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo, 2088 slba, apptag, appmask, &reftag); 2089 if (status) { 2090 req->status = status; 2091 goto out; 2092 } 2093 2094 /* 2095 * When formatted with protection information, do not compare the DIF 2096 * tuple. 
2097 */ 2098 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) { 2099 pil = ns->lbaf.ms - sizeof(NvmeDifTuple); 2100 } 2101 2102 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) { 2103 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) { 2104 req->status = NVME_CMP_FAILURE; 2105 goto out; 2106 } 2107 } 2108 2109 goto out; 2110 } 2111 2112 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) { 2113 req->status = NVME_CMP_FAILURE; 2114 goto out; 2115 } 2116 2117 block_acct_done(stats, acct); 2118 2119 out: 2120 qemu_iovec_destroy(&ctx->data.iov); 2121 g_free(ctx->data.bounce); 2122 2123 qemu_iovec_destroy(&ctx->mdata.iov); 2124 g_free(ctx->mdata.bounce); 2125 2126 g_free(ctx); 2127 2128 nvme_enqueue_req_completion(nvme_cq(req), req); 2129 } 2130 2131 static void nvme_compare_data_cb(void *opaque, int ret) 2132 { 2133 NvmeRequest *req = opaque; 2134 NvmeCtrl *n = nvme_ctrl(req); 2135 NvmeNamespace *ns = req->ns; 2136 BlockBackend *blk = ns->blkconf.blk; 2137 BlockAcctCookie *acct = &req->acct; 2138 BlockAcctStats *stats = blk_get_stats(blk); 2139 2140 struct nvme_compare_ctx *ctx = req->opaque; 2141 g_autofree uint8_t *buf = NULL; 2142 uint16_t status; 2143 2144 trace_pci_nvme_compare_data_cb(nvme_cid(req)); 2145 2146 if (ret) { 2147 block_acct_failed(stats, acct); 2148 nvme_aio_err(req, ret); 2149 goto out; 2150 } 2151 2152 buf = g_malloc(ctx->data.iov.size); 2153 2154 status = nvme_bounce_data(n, buf, ctx->data.iov.size, 2155 NVME_TX_DIRECTION_TO_DEVICE, req); 2156 if (status) { 2157 req->status = status; 2158 goto out; 2159 } 2160 2161 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) { 2162 req->status = NVME_CMP_FAILURE; 2163 goto out; 2164 } 2165 2166 if (ns->lbaf.ms) { 2167 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2168 uint64_t slba = le64_to_cpu(rw->slba); 2169 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2170 size_t mlen = nvme_m2b(ns, nlb); 2171 uint64_t offset = nvme_moff(ns, slba); 2172 2173 ctx->mdata.bounce = g_malloc(mlen); 2174 2175 qemu_iovec_init(&ctx->mdata.iov, 1); 2176 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen); 2177 2178 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0, 2179 nvme_compare_mdata_cb, req); 2180 return; 2181 } 2182 2183 block_acct_done(stats, acct); 2184 2185 out: 2186 qemu_iovec_destroy(&ctx->data.iov); 2187 g_free(ctx->data.bounce); 2188 g_free(ctx); 2189 2190 nvme_enqueue_req_completion(nvme_cq(req), req); 2191 } 2192 2193 typedef struct NvmeDSMAIOCB { 2194 BlockAIOCB common; 2195 BlockAIOCB *aiocb; 2196 NvmeRequest *req; 2197 QEMUBH *bh; 2198 int ret; 2199 2200 NvmeDsmRange *range; 2201 unsigned int nr; 2202 unsigned int idx; 2203 } NvmeDSMAIOCB; 2204 2205 static void nvme_dsm_cancel(BlockAIOCB *aiocb) 2206 { 2207 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common); 2208 2209 /* break nvme_dsm_cb loop */ 2210 iocb->idx = iocb->nr; 2211 iocb->ret = -ECANCELED; 2212 2213 if (iocb->aiocb) { 2214 blk_aio_cancel_async(iocb->aiocb); 2215 iocb->aiocb = NULL; 2216 } else { 2217 /* 2218 * We only reach this if nvme_dsm_cancel() has already been called or 2219 * the command ran to completion and nvme_dsm_bh is scheduled to run. 
         */
        assert(iocb->idx == iocb->nr);
    }
}

static const AIOCBInfo nvme_dsm_aiocb_info = {
    .aiocb_size   = sizeof(NvmeDSMAIOCB),
    .cancel_async = nvme_dsm_cancel,
};

static void nvme_dsm_bh(void *opaque)
{
    NvmeDSMAIOCB *iocb = opaque;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);
}

static void nvme_dsm_cb(void *opaque, int ret);

static void nvme_dsm_md_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeDsmRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    if (!ns->lbaf.ms) {
        nvme_dsm_cb(iocb, 0);
        return;
    }

    range = &iocb->range[iocb->idx - 1];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

    /*
     * Check that all blocks were discarded (zeroed); otherwise we do not zero
     * the metadata.
     */

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
    if (ret) {
        if (ret < 0) {
            iocb->ret = ret;
            goto done;
        }

        /* not all blocks are zeroed; skip the metadata and do the next range */
        nvme_dsm_cb(iocb, 0);
        return;
    }

    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
                                        nvme_dsm_cb, iocb);
    return;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static void nvme_dsm_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    NvmeDsmRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

next:
    if (iocb->idx == iocb->nr) {
        goto done;
    }

    range = &iocb->range[iocb->idx++];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

    trace_pci_nvme_dsm_deallocate(slba, nlb);

    if (nlb > n->dmrsl) {
        trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
        goto next;
    }

    if (nvme_check_bounds(ns, slba, nlb)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb,
                                             ns->id_ns.nsze);
        goto next;
    }

    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
                                   nvme_l2b(ns, nlb),
                                   nvme_dsm_md_cb, iocb);
    return;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
    uint32_t attr = le32_to_cpu(dsm->attributes);
    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_dsm(nr, attr);

    if (attr & NVME_DSMGMT_AD) {
        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
                                         nvme_misc_cb, req);

        iocb->req = req;
        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
        iocb->ret = 0;
        iocb->range = g_new(NvmeDsmRange, nr);
        iocb->nr = nr;
        iocb->idx = 0;

        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
                          req);
        if (status) {
            /* the command fails synchronously; release the iocb again */
            qemu_bh_delete(iocb->bh);
            g_free(iocb->range);
            qemu_aio_unref(iocb);
            return status;
        }

        req->aiocb = &iocb->common;
        nvme_dsm_cb(iocb, 0);

        return NVME_NO_COMPLETE;
    }

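    /*
     * Only the Deallocate (AD) attribute is acted upon; for any other
     * combination of dataset management attributes there is nothing to do
     * and the command completes immediately with NVME_SUCCESS.
     */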
return status; 2372 } 2373 2374 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) 2375 { 2376 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2377 NvmeNamespace *ns = req->ns; 2378 BlockBackend *blk = ns->blkconf.blk; 2379 uint64_t slba = le64_to_cpu(rw->slba); 2380 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2381 size_t len = nvme_l2b(ns, nlb); 2382 int64_t offset = nvme_l2b(ns, slba); 2383 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 2384 uint32_t reftag = le32_to_cpu(rw->reftag); 2385 NvmeBounceContext *ctx = NULL; 2386 uint16_t status; 2387 2388 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2389 2390 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2391 status = nvme_check_prinfo(ns, prinfo, slba, reftag); 2392 if (status) { 2393 return status; 2394 } 2395 2396 if (prinfo & NVME_PRINFO_PRACT) { 2397 return NVME_INVALID_PROT_INFO | NVME_DNR; 2398 } 2399 } 2400 2401 if (len > n->page_size << n->params.vsl) { 2402 return NVME_INVALID_FIELD | NVME_DNR; 2403 } 2404 2405 status = nvme_check_bounds(ns, slba, nlb); 2406 if (status) { 2407 return status; 2408 } 2409 2410 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2411 status = nvme_check_dulbe(ns, slba, nlb); 2412 if (status) { 2413 return status; 2414 } 2415 } 2416 2417 ctx = g_new0(NvmeBounceContext, 1); 2418 ctx->req = req; 2419 2420 ctx->data.bounce = g_malloc(len); 2421 2422 qemu_iovec_init(&ctx->data.iov, 1); 2423 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len); 2424 2425 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size, 2426 BLOCK_ACCT_READ); 2427 2428 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0, 2429 nvme_verify_mdata_in_cb, ctx); 2430 return NVME_NO_COMPLETE; 2431 } 2432 2433 typedef struct NvmeCopyAIOCB { 2434 BlockAIOCB common; 2435 BlockAIOCB *aiocb; 2436 NvmeRequest *req; 2437 QEMUBH *bh; 2438 int ret; 2439 2440 NvmeCopySourceRange *ranges; 2441 int nr; 2442 int idx; 2443 2444 uint8_t *bounce; 2445 QEMUIOVector iov; 2446 struct { 2447 BlockAcctCookie read; 2448 BlockAcctCookie write; 2449 } acct; 2450 2451 uint32_t reftag; 2452 uint64_t slba; 2453 2454 NvmeZone *zone; 2455 } NvmeCopyAIOCB; 2456 2457 static void nvme_copy_cancel(BlockAIOCB *aiocb) 2458 { 2459 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common); 2460 2461 iocb->ret = -ECANCELED; 2462 2463 if (iocb->aiocb) { 2464 blk_aio_cancel_async(iocb->aiocb); 2465 iocb->aiocb = NULL; 2466 } 2467 } 2468 2469 static const AIOCBInfo nvme_copy_aiocb_info = { 2470 .aiocb_size = sizeof(NvmeCopyAIOCB), 2471 .cancel_async = nvme_copy_cancel, 2472 }; 2473 2474 static void nvme_copy_bh(void *opaque) 2475 { 2476 NvmeCopyAIOCB *iocb = opaque; 2477 NvmeRequest *req = iocb->req; 2478 NvmeNamespace *ns = req->ns; 2479 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk); 2480 2481 if (iocb->idx != iocb->nr) { 2482 req->cqe.result = cpu_to_le32(iocb->idx); 2483 } 2484 2485 qemu_iovec_destroy(&iocb->iov); 2486 g_free(iocb->bounce); 2487 2488 qemu_bh_delete(iocb->bh); 2489 iocb->bh = NULL; 2490 2491 if (iocb->ret < 0) { 2492 block_acct_failed(stats, &iocb->acct.read); 2493 block_acct_failed(stats, &iocb->acct.write); 2494 } else { 2495 block_acct_done(stats, &iocb->acct.read); 2496 block_acct_done(stats, &iocb->acct.write); 2497 } 2498 2499 iocb->common.cb(iocb->common.opaque, iocb->ret); 2500 qemu_aio_unref(iocb); 2501 } 2502 2503 static void nvme_copy_cb(void *opaque, int ret); 2504 2505 static void nvme_copy_out_completed_cb(void *opaque, int ret) 2506 { 2507 NvmeCopyAIOCB *iocb = opaque; 2508 
NvmeRequest *req = iocb->req; 2509 NvmeNamespace *ns = req->ns; 2510 NvmeCopySourceRange *range = &iocb->ranges[iocb->idx]; 2511 uint32_t nlb = le32_to_cpu(range->nlb) + 1; 2512 2513 if (ret < 0) { 2514 iocb->ret = ret; 2515 goto out; 2516 } else if (iocb->ret < 0) { 2517 goto out; 2518 } 2519 2520 if (ns->params.zoned) { 2521 nvme_advance_zone_wp(ns, iocb->zone, nlb); 2522 } 2523 2524 iocb->idx++; 2525 iocb->slba += nlb; 2526 out: 2527 nvme_copy_cb(iocb, iocb->ret); 2528 } 2529 2530 static void nvme_copy_out_cb(void *opaque, int ret) 2531 { 2532 NvmeCopyAIOCB *iocb = opaque; 2533 NvmeRequest *req = iocb->req; 2534 NvmeNamespace *ns = req->ns; 2535 NvmeCopySourceRange *range; 2536 uint32_t nlb; 2537 size_t mlen; 2538 uint8_t *mbounce; 2539 2540 if (ret < 0) { 2541 iocb->ret = ret; 2542 goto out; 2543 } else if (iocb->ret < 0) { 2544 goto out; 2545 } 2546 2547 if (!ns->lbaf.ms) { 2548 nvme_copy_out_completed_cb(iocb, 0); 2549 return; 2550 } 2551 2552 range = &iocb->ranges[iocb->idx]; 2553 nlb = le32_to_cpu(range->nlb) + 1; 2554 2555 mlen = nvme_m2b(ns, nlb); 2556 mbounce = iocb->bounce + nvme_l2b(ns, nlb); 2557 2558 qemu_iovec_reset(&iocb->iov); 2559 qemu_iovec_add(&iocb->iov, mbounce, mlen); 2560 2561 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba), 2562 &iocb->iov, 0, nvme_copy_out_completed_cb, 2563 iocb); 2564 2565 return; 2566 2567 out: 2568 nvme_copy_cb(iocb, ret); 2569 } 2570 2571 static void nvme_copy_in_completed_cb(void *opaque, int ret) 2572 { 2573 NvmeCopyAIOCB *iocb = opaque; 2574 NvmeRequest *req = iocb->req; 2575 NvmeNamespace *ns = req->ns; 2576 NvmeCopySourceRange *range; 2577 uint32_t nlb; 2578 size_t len; 2579 uint16_t status; 2580 2581 if (ret < 0) { 2582 iocb->ret = ret; 2583 goto out; 2584 } else if (iocb->ret < 0) { 2585 goto out; 2586 } 2587 2588 range = &iocb->ranges[iocb->idx]; 2589 nlb = le32_to_cpu(range->nlb) + 1; 2590 len = nvme_l2b(ns, nlb); 2591 2592 trace_pci_nvme_copy_out(iocb->slba, nlb); 2593 2594 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 2595 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2596 2597 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf); 2598 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf); 2599 2600 uint16_t apptag = le16_to_cpu(range->apptag); 2601 uint16_t appmask = le16_to_cpu(range->appmask); 2602 uint32_t reftag = le32_to_cpu(range->reftag); 2603 2604 uint64_t slba = le64_to_cpu(range->slba); 2605 size_t mlen = nvme_m2b(ns, nlb); 2606 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb); 2607 2608 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor, 2609 slba, apptag, appmask, &reftag); 2610 if (status) { 2611 goto invalid; 2612 } 2613 2614 apptag = le16_to_cpu(copy->apptag); 2615 appmask = le16_to_cpu(copy->appmask); 2616 2617 if (prinfow & NVME_PRINFO_PRACT) { 2618 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag); 2619 if (status) { 2620 goto invalid; 2621 } 2622 2623 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen, 2624 apptag, &iocb->reftag); 2625 } else { 2626 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, 2627 prinfow, iocb->slba, apptag, appmask, 2628 &iocb->reftag); 2629 if (status) { 2630 goto invalid; 2631 } 2632 } 2633 } 2634 2635 status = nvme_check_bounds(ns, iocb->slba, nlb); 2636 if (status) { 2637 goto invalid; 2638 } 2639 2640 if (ns->params.zoned) { 2641 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb); 2642 if (status) { 2643 goto invalid; 2644 } 2645 2646 iocb->zone->w_ptr += nlb; 2647 } 2648 2649 
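    /*
     * The bounced source data has passed all checks; write it to the
     * destination SLBA. Any metadata is written afterwards from
     * nvme_copy_out_cb().
     */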
qemu_iovec_reset(&iocb->iov); 2650 qemu_iovec_add(&iocb->iov, iocb->bounce, len); 2651 2652 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba), 2653 &iocb->iov, 0, nvme_copy_out_cb, iocb); 2654 2655 return; 2656 2657 invalid: 2658 req->status = status; 2659 iocb->aiocb = NULL; 2660 if (iocb->bh) { 2661 qemu_bh_schedule(iocb->bh); 2662 } 2663 2664 return; 2665 2666 out: 2667 nvme_copy_cb(iocb, ret); 2668 } 2669 2670 static void nvme_copy_in_cb(void *opaque, int ret) 2671 { 2672 NvmeCopyAIOCB *iocb = opaque; 2673 NvmeRequest *req = iocb->req; 2674 NvmeNamespace *ns = req->ns; 2675 NvmeCopySourceRange *range; 2676 uint64_t slba; 2677 uint32_t nlb; 2678 2679 if (ret < 0) { 2680 iocb->ret = ret; 2681 goto out; 2682 } else if (iocb->ret < 0) { 2683 goto out; 2684 } 2685 2686 if (!ns->lbaf.ms) { 2687 nvme_copy_in_completed_cb(iocb, 0); 2688 return; 2689 } 2690 2691 range = &iocb->ranges[iocb->idx]; 2692 slba = le64_to_cpu(range->slba); 2693 nlb = le32_to_cpu(range->nlb) + 1; 2694 2695 qemu_iovec_reset(&iocb->iov); 2696 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb), 2697 nvme_m2b(ns, nlb)); 2698 2699 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba), 2700 &iocb->iov, 0, nvme_copy_in_completed_cb, 2701 iocb); 2702 return; 2703 2704 out: 2705 nvme_copy_cb(iocb, iocb->ret); 2706 } 2707 2708 static void nvme_copy_cb(void *opaque, int ret) 2709 { 2710 NvmeCopyAIOCB *iocb = opaque; 2711 NvmeRequest *req = iocb->req; 2712 NvmeNamespace *ns = req->ns; 2713 NvmeCopySourceRange *range; 2714 uint64_t slba; 2715 uint32_t nlb; 2716 size_t len; 2717 uint16_t status; 2718 2719 if (ret < 0) { 2720 iocb->ret = ret; 2721 goto done; 2722 } else if (iocb->ret < 0) { 2723 goto done; 2724 } 2725 2726 if (iocb->idx == iocb->nr) { 2727 goto done; 2728 } 2729 2730 range = &iocb->ranges[iocb->idx]; 2731 slba = le64_to_cpu(range->slba); 2732 nlb = le32_to_cpu(range->nlb) + 1; 2733 len = nvme_l2b(ns, nlb); 2734 2735 trace_pci_nvme_copy_source_range(slba, nlb); 2736 2737 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) { 2738 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 2739 goto invalid; 2740 } 2741 2742 status = nvme_check_bounds(ns, slba, nlb); 2743 if (status) { 2744 goto invalid; 2745 } 2746 2747 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2748 status = nvme_check_dulbe(ns, slba, nlb); 2749 if (status) { 2750 goto invalid; 2751 } 2752 } 2753 2754 if (ns->params.zoned) { 2755 status = nvme_check_zone_read(ns, slba, nlb); 2756 if (status) { 2757 goto invalid; 2758 } 2759 } 2760 2761 qemu_iovec_reset(&iocb->iov); 2762 qemu_iovec_add(&iocb->iov, iocb->bounce, len); 2763 2764 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba), 2765 &iocb->iov, 0, nvme_copy_in_cb, iocb); 2766 return; 2767 2768 invalid: 2769 req->status = status; 2770 done: 2771 iocb->aiocb = NULL; 2772 if (iocb->bh) { 2773 qemu_bh_schedule(iocb->bh); 2774 } 2775 } 2776 2777 2778 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) 2779 { 2780 NvmeNamespace *ns = req->ns; 2781 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; 2782 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk, 2783 nvme_misc_cb, req); 2784 uint16_t nr = copy->nr + 1; 2785 uint8_t format = copy->control[0] & 0xf; 2786 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf); 2787 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf); 2788 2789 uint16_t status; 2790 2791 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); 2792 2793 iocb->ranges = NULL; 2794 iocb->zone = NULL; 2795 2796 if 
(NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && 2797 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) { 2798 status = NVME_INVALID_FIELD | NVME_DNR; 2799 goto invalid; 2800 } 2801 2802 if (!(n->id_ctrl.ocfs & (1 << format))) { 2803 trace_pci_nvme_err_copy_invalid_format(format); 2804 status = NVME_INVALID_FIELD | NVME_DNR; 2805 goto invalid; 2806 } 2807 2808 if (nr > ns->id_ns.msrc + 1) { 2809 status = NVME_CMD_SIZE_LIMIT | NVME_DNR; 2810 goto invalid; 2811 } 2812 2813 iocb->ranges = g_new(NvmeCopySourceRange, nr); 2814 2815 status = nvme_h2c(n, (uint8_t *)iocb->ranges, 2816 sizeof(NvmeCopySourceRange) * nr, req); 2817 if (status) { 2818 goto invalid; 2819 } 2820 2821 iocb->slba = le64_to_cpu(copy->sdlba); 2822 2823 if (ns->params.zoned) { 2824 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba); 2825 if (!iocb->zone) { 2826 status = NVME_LBA_RANGE | NVME_DNR; 2827 goto invalid; 2828 } 2829 2830 status = nvme_zrm_auto(n, ns, iocb->zone); 2831 if (status) { 2832 goto invalid; 2833 } 2834 } 2835 2836 iocb->req = req; 2837 iocb->bh = qemu_bh_new(nvme_copy_bh, iocb); 2838 iocb->ret = 0; 2839 iocb->nr = nr; 2840 iocb->idx = 0; 2841 iocb->reftag = le32_to_cpu(copy->reftag); 2842 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl), 2843 ns->lbasz + ns->lbaf.ms); 2844 2845 qemu_iovec_init(&iocb->iov, 1); 2846 2847 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0, 2848 BLOCK_ACCT_READ); 2849 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0, 2850 BLOCK_ACCT_WRITE); 2851 2852 req->aiocb = &iocb->common; 2853 nvme_copy_cb(iocb, 0); 2854 2855 return NVME_NO_COMPLETE; 2856 2857 invalid: 2858 g_free(iocb->ranges); 2859 qemu_aio_unref(iocb); 2860 return status; 2861 } 2862 2863 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) 2864 { 2865 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 2866 NvmeNamespace *ns = req->ns; 2867 BlockBackend *blk = ns->blkconf.blk; 2868 uint64_t slba = le64_to_cpu(rw->slba); 2869 uint32_t nlb = le16_to_cpu(rw->nlb) + 1; 2870 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 2871 size_t data_len = nvme_l2b(ns, nlb); 2872 size_t len = data_len; 2873 int64_t offset = nvme_l2b(ns, slba); 2874 struct nvme_compare_ctx *ctx = NULL; 2875 uint16_t status; 2876 2877 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); 2878 2879 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) { 2880 return NVME_INVALID_PROT_INFO | NVME_DNR; 2881 } 2882 2883 if (nvme_ns_ext(ns)) { 2884 len += nvme_m2b(ns, nlb); 2885 } 2886 2887 status = nvme_check_mdts(n, len); 2888 if (status) { 2889 return status; 2890 } 2891 2892 status = nvme_check_bounds(ns, slba, nlb); 2893 if (status) { 2894 return status; 2895 } 2896 2897 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 2898 status = nvme_check_dulbe(ns, slba, nlb); 2899 if (status) { 2900 return status; 2901 } 2902 } 2903 2904 status = nvme_map_dptr(n, &req->sg, len, &req->cmd); 2905 if (status) { 2906 return status; 2907 } 2908 2909 ctx = g_new(struct nvme_compare_ctx, 1); 2910 ctx->data.bounce = g_malloc(data_len); 2911 2912 req->opaque = ctx; 2913 2914 qemu_iovec_init(&ctx->data.iov, 1); 2915 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len); 2916 2917 block_acct_start(blk_get_stats(blk), &req->acct, data_len, 2918 BLOCK_ACCT_READ); 2919 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0, 2920 nvme_compare_data_cb, req); 2921 2922 return NVME_NO_COMPLETE; 2923 } 2924 2925 typedef struct NvmeFlushAIOCB { 2926 BlockAIOCB common; 2927 BlockAIOCB 
*aiocb; 2928 NvmeRequest *req; 2929 QEMUBH *bh; 2930 int ret; 2931 2932 NvmeNamespace *ns; 2933 uint32_t nsid; 2934 bool broadcast; 2935 } NvmeFlushAIOCB; 2936 2937 static void nvme_flush_cancel(BlockAIOCB *acb) 2938 { 2939 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common); 2940 2941 iocb->ret = -ECANCELED; 2942 2943 if (iocb->aiocb) { 2944 blk_aio_cancel_async(iocb->aiocb); 2945 } 2946 } 2947 2948 static const AIOCBInfo nvme_flush_aiocb_info = { 2949 .aiocb_size = sizeof(NvmeFlushAIOCB), 2950 .cancel_async = nvme_flush_cancel, 2951 .get_aio_context = nvme_get_aio_context, 2952 }; 2953 2954 static void nvme_flush_ns_cb(void *opaque, int ret) 2955 { 2956 NvmeFlushAIOCB *iocb = opaque; 2957 NvmeNamespace *ns = iocb->ns; 2958 2959 if (ret < 0) { 2960 iocb->ret = ret; 2961 goto out; 2962 } else if (iocb->ret < 0) { 2963 goto out; 2964 } 2965 2966 if (ns) { 2967 trace_pci_nvme_flush_ns(iocb->nsid); 2968 2969 iocb->ns = NULL; 2970 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb); 2971 return; 2972 } 2973 2974 out: 2975 iocb->aiocb = NULL; 2976 qemu_bh_schedule(iocb->bh); 2977 } 2978 2979 static void nvme_flush_bh(void *opaque) 2980 { 2981 NvmeFlushAIOCB *iocb = opaque; 2982 NvmeRequest *req = iocb->req; 2983 NvmeCtrl *n = nvme_ctrl(req); 2984 int i; 2985 2986 if (iocb->ret < 0) { 2987 goto done; 2988 } 2989 2990 if (iocb->broadcast) { 2991 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) { 2992 iocb->ns = nvme_ns(n, i); 2993 if (iocb->ns) { 2994 iocb->nsid = i; 2995 break; 2996 } 2997 } 2998 } 2999 3000 if (!iocb->ns) { 3001 goto done; 3002 } 3003 3004 nvme_flush_ns_cb(iocb, 0); 3005 return; 3006 3007 done: 3008 qemu_bh_delete(iocb->bh); 3009 iocb->bh = NULL; 3010 3011 iocb->common.cb(iocb->common.opaque, iocb->ret); 3012 3013 qemu_aio_unref(iocb); 3014 3015 return; 3016 } 3017 3018 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) 3019 { 3020 NvmeFlushAIOCB *iocb; 3021 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3022 uint16_t status; 3023 3024 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req); 3025 3026 iocb->req = req; 3027 iocb->bh = qemu_bh_new(nvme_flush_bh, iocb); 3028 iocb->ret = 0; 3029 iocb->ns = NULL; 3030 iocb->nsid = 0; 3031 iocb->broadcast = (nsid == NVME_NSID_BROADCAST); 3032 3033 if (!iocb->broadcast) { 3034 if (!nvme_nsid_valid(n, nsid)) { 3035 status = NVME_INVALID_NSID | NVME_DNR; 3036 goto out; 3037 } 3038 3039 iocb->ns = nvme_ns(n, nsid); 3040 if (!iocb->ns) { 3041 status = NVME_INVALID_FIELD | NVME_DNR; 3042 goto out; 3043 } 3044 3045 iocb->nsid = nsid; 3046 } 3047 3048 req->aiocb = &iocb->common; 3049 qemu_bh_schedule(iocb->bh); 3050 3051 return NVME_NO_COMPLETE; 3052 3053 out: 3054 qemu_bh_delete(iocb->bh); 3055 iocb->bh = NULL; 3056 qemu_aio_unref(iocb); 3057 3058 return status; 3059 } 3060 3061 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) 3062 { 3063 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3064 NvmeNamespace *ns = req->ns; 3065 uint64_t slba = le64_to_cpu(rw->slba); 3066 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 3067 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); 3068 uint64_t data_size = nvme_l2b(ns, nlb); 3069 uint64_t mapped_size = data_size; 3070 uint64_t data_offset; 3071 BlockBackend *blk = ns->blkconf.blk; 3072 uint16_t status; 3073 3074 if (nvme_ns_ext(ns)) { 3075 mapped_size += nvme_m2b(ns, nlb); 3076 3077 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3078 bool pract = prinfo & NVME_PRINFO_PRACT; 3079 3080 if (pract && ns->lbaf.ms == 8) { 3081 mapped_size = 
data_size; 3082 } 3083 } 3084 } 3085 3086 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba); 3087 3088 status = nvme_check_mdts(n, mapped_size); 3089 if (status) { 3090 goto invalid; 3091 } 3092 3093 status = nvme_check_bounds(ns, slba, nlb); 3094 if (status) { 3095 goto invalid; 3096 } 3097 3098 if (ns->params.zoned) { 3099 status = nvme_check_zone_read(ns, slba, nlb); 3100 if (status) { 3101 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); 3102 goto invalid; 3103 } 3104 } 3105 3106 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { 3107 status = nvme_check_dulbe(ns, slba, nlb); 3108 if (status) { 3109 goto invalid; 3110 } 3111 } 3112 3113 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3114 return nvme_dif_rw(n, req); 3115 } 3116 3117 status = nvme_map_data(n, nlb, req); 3118 if (status) { 3119 goto invalid; 3120 } 3121 3122 data_offset = nvme_l2b(ns, slba); 3123 3124 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 3125 BLOCK_ACCT_READ); 3126 nvme_blk_read(blk, data_offset, nvme_rw_cb, req); 3127 return NVME_NO_COMPLETE; 3128 3129 invalid: 3130 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); 3131 return status | NVME_DNR; 3132 } 3133 3134 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, 3135 bool wrz) 3136 { 3137 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; 3138 NvmeNamespace *ns = req->ns; 3139 uint64_t slba = le64_to_cpu(rw->slba); 3140 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; 3141 uint16_t ctrl = le16_to_cpu(rw->control); 3142 uint8_t prinfo = NVME_RW_PRINFO(ctrl); 3143 uint64_t data_size = nvme_l2b(ns, nlb); 3144 uint64_t mapped_size = data_size; 3145 uint64_t data_offset; 3146 NvmeZone *zone; 3147 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; 3148 BlockBackend *blk = ns->blkconf.blk; 3149 uint16_t status; 3150 3151 if (nvme_ns_ext(ns)) { 3152 mapped_size += nvme_m2b(ns, nlb); 3153 3154 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3155 bool pract = prinfo & NVME_PRINFO_PRACT; 3156 3157 if (pract && ns->lbaf.ms == 8) { 3158 mapped_size -= nvme_m2b(ns, nlb); 3159 } 3160 } 3161 } 3162 3163 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), 3164 nvme_nsid(ns), nlb, mapped_size, slba); 3165 3166 if (!wrz) { 3167 status = nvme_check_mdts(n, mapped_size); 3168 if (status) { 3169 goto invalid; 3170 } 3171 } 3172 3173 status = nvme_check_bounds(ns, slba, nlb); 3174 if (status) { 3175 goto invalid; 3176 } 3177 3178 if (ns->params.zoned) { 3179 zone = nvme_get_zone_by_slba(ns, slba); 3180 assert(zone); 3181 3182 if (append) { 3183 bool piremap = !!(ctrl & NVME_RW_PIREMAP); 3184 3185 if (unlikely(slba != zone->d.zslba)) { 3186 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); 3187 status = NVME_INVALID_FIELD; 3188 goto invalid; 3189 } 3190 3191 if (n->params.zasl && 3192 data_size > (uint64_t)n->page_size << n->params.zasl) { 3193 trace_pci_nvme_err_zasl(data_size); 3194 return NVME_INVALID_FIELD | NVME_DNR; 3195 } 3196 3197 slba = zone->w_ptr; 3198 rw->slba = cpu_to_le64(slba); 3199 res->slba = cpu_to_le64(slba); 3200 3201 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3202 case NVME_ID_NS_DPS_TYPE_1: 3203 if (!piremap) { 3204 return NVME_INVALID_PROT_INFO | NVME_DNR; 3205 } 3206 3207 /* fallthrough */ 3208 3209 case NVME_ID_NS_DPS_TYPE_2: 3210 if (piremap) { 3211 uint32_t reftag = le32_to_cpu(rw->reftag); 3212 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba)); 3213 } 3214 3215 break; 3216 3217 case NVME_ID_NS_DPS_TYPE_3: 3218 if (piremap) { 3219 return NVME_INVALID_PROT_INFO | NVME_DNR; 
3220 } 3221 3222 break; 3223 } 3224 } 3225 3226 status = nvme_check_zone_write(ns, zone, slba, nlb); 3227 if (status) { 3228 goto invalid; 3229 } 3230 3231 status = nvme_zrm_auto(n, ns, zone); 3232 if (status) { 3233 goto invalid; 3234 } 3235 3236 zone->w_ptr += nlb; 3237 } 3238 3239 data_offset = nvme_l2b(ns, slba); 3240 3241 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { 3242 return nvme_dif_rw(n, req); 3243 } 3244 3245 if (!wrz) { 3246 status = nvme_map_data(n, nlb, req); 3247 if (status) { 3248 goto invalid; 3249 } 3250 3251 block_acct_start(blk_get_stats(blk), &req->acct, data_size, 3252 BLOCK_ACCT_WRITE); 3253 nvme_blk_write(blk, data_offset, nvme_rw_cb, req); 3254 } else { 3255 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, 3256 BDRV_REQ_MAY_UNMAP, nvme_rw_cb, 3257 req); 3258 } 3259 3260 return NVME_NO_COMPLETE; 3261 3262 invalid: 3263 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); 3264 return status | NVME_DNR; 3265 } 3266 3267 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) 3268 { 3269 return nvme_do_write(n, req, false, false); 3270 } 3271 3272 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) 3273 { 3274 return nvme_do_write(n, req, false, true); 3275 } 3276 3277 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) 3278 { 3279 return nvme_do_write(n, req, true, false); 3280 } 3281 3282 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, 3283 uint64_t *slba, uint32_t *zone_idx) 3284 { 3285 uint32_t dw10 = le32_to_cpu(c->cdw10); 3286 uint32_t dw11 = le32_to_cpu(c->cdw11); 3287 3288 if (!ns->params.zoned) { 3289 trace_pci_nvme_err_invalid_opc(c->opcode); 3290 return NVME_INVALID_OPCODE | NVME_DNR; 3291 } 3292 3293 *slba = ((uint64_t)dw11) << 32 | dw10; 3294 if (unlikely(*slba >= ns->id_ns.nsze)) { 3295 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze); 3296 *slba = 0; 3297 return NVME_LBA_RANGE | NVME_DNR; 3298 } 3299 3300 *zone_idx = nvme_zone_idx(ns, *slba); 3301 assert(*zone_idx < ns->num_zones); 3302 3303 return NVME_SUCCESS; 3304 } 3305 3306 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState, 3307 NvmeRequest *); 3308 3309 enum NvmeZoneProcessingMask { 3310 NVME_PROC_CURRENT_ZONE = 0, 3311 NVME_PROC_OPENED_ZONES = 1 << 0, 3312 NVME_PROC_CLOSED_ZONES = 1 << 1, 3313 NVME_PROC_READ_ONLY_ZONES = 1 << 2, 3314 NVME_PROC_FULL_ZONES = 1 << 3, 3315 }; 3316 3317 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, 3318 NvmeZoneState state, NvmeRequest *req) 3319 { 3320 return nvme_zrm_open(nvme_ctrl(req), ns, zone); 3321 } 3322 3323 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, 3324 NvmeZoneState state, NvmeRequest *req) 3325 { 3326 return nvme_zrm_close(ns, zone); 3327 } 3328 3329 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, 3330 NvmeZoneState state, NvmeRequest *req) 3331 { 3332 return nvme_zrm_finish(ns, zone); 3333 } 3334 3335 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, 3336 NvmeZoneState state, NvmeRequest *req) 3337 { 3338 switch (state) { 3339 case NVME_ZONE_STATE_READ_ONLY: 3340 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE); 3341 /* fall through */ 3342 case NVME_ZONE_STATE_OFFLINE: 3343 return NVME_SUCCESS; 3344 default: 3345 return NVME_ZONE_INVAL_TRANSITION; 3346 } 3347 } 3348 3349 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) 3350 { 3351 uint16_t status; 3352 uint8_t state = nvme_get_zone_state(zone); 3353 3354 if (state == 
NVME_ZONE_STATE_EMPTY) { 3355 status = nvme_aor_check(ns, 1, 0); 3356 if (status) { 3357 return status; 3358 } 3359 nvme_aor_inc_active(ns); 3360 zone->d.za |= NVME_ZA_ZD_EXT_VALID; 3361 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); 3362 return NVME_SUCCESS; 3363 } 3364 3365 return NVME_ZONE_INVAL_TRANSITION; 3366 } 3367 3368 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, 3369 enum NvmeZoneProcessingMask proc_mask, 3370 op_handler_t op_hndlr, NvmeRequest *req) 3371 { 3372 uint16_t status = NVME_SUCCESS; 3373 NvmeZoneState zs = nvme_get_zone_state(zone); 3374 bool proc_zone; 3375 3376 switch (zs) { 3377 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 3378 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 3379 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; 3380 break; 3381 case NVME_ZONE_STATE_CLOSED: 3382 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; 3383 break; 3384 case NVME_ZONE_STATE_READ_ONLY: 3385 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES; 3386 break; 3387 case NVME_ZONE_STATE_FULL: 3388 proc_zone = proc_mask & NVME_PROC_FULL_ZONES; 3389 break; 3390 default: 3391 proc_zone = false; 3392 } 3393 3394 if (proc_zone) { 3395 status = op_hndlr(ns, zone, zs, req); 3396 } 3397 3398 return status; 3399 } 3400 3401 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, 3402 enum NvmeZoneProcessingMask proc_mask, 3403 op_handler_t op_hndlr, NvmeRequest *req) 3404 { 3405 NvmeZone *next; 3406 uint16_t status = NVME_SUCCESS; 3407 int i; 3408 3409 if (!proc_mask) { 3410 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req); 3411 } else { 3412 if (proc_mask & NVME_PROC_CLOSED_ZONES) { 3413 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { 3414 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3415 req); 3416 if (status && status != NVME_NO_COMPLETE) { 3417 goto out; 3418 } 3419 } 3420 } 3421 if (proc_mask & NVME_PROC_OPENED_ZONES) { 3422 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { 3423 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3424 req); 3425 if (status && status != NVME_NO_COMPLETE) { 3426 goto out; 3427 } 3428 } 3429 3430 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { 3431 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3432 req); 3433 if (status && status != NVME_NO_COMPLETE) { 3434 goto out; 3435 } 3436 } 3437 } 3438 if (proc_mask & NVME_PROC_FULL_ZONES) { 3439 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { 3440 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3441 req); 3442 if (status && status != NVME_NO_COMPLETE) { 3443 goto out; 3444 } 3445 } 3446 } 3447 3448 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { 3449 for (i = 0; i < ns->num_zones; i++, zone++) { 3450 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, 3451 req); 3452 if (status && status != NVME_NO_COMPLETE) { 3453 goto out; 3454 } 3455 } 3456 } 3457 } 3458 3459 out: 3460 return status; 3461 } 3462 3463 typedef struct NvmeZoneResetAIOCB { 3464 BlockAIOCB common; 3465 BlockAIOCB *aiocb; 3466 NvmeRequest *req; 3467 QEMUBH *bh; 3468 int ret; 3469 3470 bool all; 3471 int idx; 3472 NvmeZone *zone; 3473 } NvmeZoneResetAIOCB; 3474 3475 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb) 3476 { 3477 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common); 3478 NvmeRequest *req = iocb->req; 3479 NvmeNamespace *ns = req->ns; 3480 3481 iocb->idx = ns->num_zones; 3482 3483 iocb->ret = -ECANCELED; 3484 3485 if (iocb->aiocb) { 3486 blk_aio_cancel_async(iocb->aiocb); 3487 
iocb->aiocb = NULL; 3488 } 3489 } 3490 3491 static const AIOCBInfo nvme_zone_reset_aiocb_info = { 3492 .aiocb_size = sizeof(NvmeZoneResetAIOCB), 3493 .cancel_async = nvme_zone_reset_cancel, 3494 }; 3495 3496 static void nvme_zone_reset_bh(void *opaque) 3497 { 3498 NvmeZoneResetAIOCB *iocb = opaque; 3499 3500 iocb->common.cb(iocb->common.opaque, iocb->ret); 3501 3502 qemu_bh_delete(iocb->bh); 3503 iocb->bh = NULL; 3504 qemu_aio_unref(iocb); 3505 } 3506 3507 static void nvme_zone_reset_cb(void *opaque, int ret); 3508 3509 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret) 3510 { 3511 NvmeZoneResetAIOCB *iocb = opaque; 3512 NvmeRequest *req = iocb->req; 3513 NvmeNamespace *ns = req->ns; 3514 int64_t moff; 3515 int count; 3516 3517 if (ret < 0) { 3518 nvme_zone_reset_cb(iocb, ret); 3519 return; 3520 } 3521 3522 if (!ns->lbaf.ms) { 3523 nvme_zone_reset_cb(iocb, 0); 3524 return; 3525 } 3526 3527 moff = nvme_moff(ns, iocb->zone->d.zslba); 3528 count = nvme_m2b(ns, ns->zone_size); 3529 3530 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count, 3531 BDRV_REQ_MAY_UNMAP, 3532 nvme_zone_reset_cb, iocb); 3533 return; 3534 } 3535 3536 static void nvme_zone_reset_cb(void *opaque, int ret) 3537 { 3538 NvmeZoneResetAIOCB *iocb = opaque; 3539 NvmeRequest *req = iocb->req; 3540 NvmeNamespace *ns = req->ns; 3541 3542 if (ret < 0) { 3543 iocb->ret = ret; 3544 goto done; 3545 } 3546 3547 if (iocb->zone) { 3548 nvme_zrm_reset(ns, iocb->zone); 3549 3550 if (!iocb->all) { 3551 goto done; 3552 } 3553 } 3554 3555 while (iocb->idx < ns->num_zones) { 3556 NvmeZone *zone = &ns->zone_array[iocb->idx++]; 3557 3558 switch (nvme_get_zone_state(zone)) { 3559 case NVME_ZONE_STATE_EMPTY: 3560 if (!iocb->all) { 3561 goto done; 3562 } 3563 3564 continue; 3565 3566 case NVME_ZONE_STATE_EXPLICITLY_OPEN: 3567 case NVME_ZONE_STATE_IMPLICITLY_OPEN: 3568 case NVME_ZONE_STATE_CLOSED: 3569 case NVME_ZONE_STATE_FULL: 3570 iocb->zone = zone; 3571 break; 3572 3573 default: 3574 continue; 3575 } 3576 3577 trace_pci_nvme_zns_zone_reset(zone->d.zslba); 3578 3579 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, 3580 nvme_l2b(ns, zone->d.zslba), 3581 nvme_l2b(ns, ns->zone_size), 3582 BDRV_REQ_MAY_UNMAP, 3583 nvme_zone_reset_epilogue_cb, 3584 iocb); 3585 return; 3586 } 3587 3588 done: 3589 iocb->aiocb = NULL; 3590 if (iocb->bh) { 3591 qemu_bh_schedule(iocb->bh); 3592 } 3593 } 3594 3595 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) 3596 { 3597 NvmeCmd *cmd = (NvmeCmd *)&req->cmd; 3598 NvmeNamespace *ns = req->ns; 3599 NvmeZone *zone; 3600 NvmeZoneResetAIOCB *iocb; 3601 uint8_t *zd_ext; 3602 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 3603 uint64_t slba = 0; 3604 uint32_t zone_idx = 0; 3605 uint16_t status; 3606 uint8_t action; 3607 bool all; 3608 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; 3609 3610 action = dw13 & 0xff; 3611 all = !!(dw13 & 0x100); 3612 3613 req->status = NVME_SUCCESS; 3614 3615 if (!all) { 3616 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 3617 if (status) { 3618 return status; 3619 } 3620 } 3621 3622 zone = &ns->zone_array[zone_idx]; 3623 if (slba != zone->d.zslba) { 3624 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); 3625 return NVME_INVALID_FIELD | NVME_DNR; 3626 } 3627 3628 switch (action) { 3629 3630 case NVME_ZONE_ACTION_OPEN: 3631 if (all) { 3632 proc_mask = NVME_PROC_CLOSED_ZONES; 3633 } 3634 trace_pci_nvme_open_zone(slba, zone_idx, all); 3635 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req); 3636 
break; 3637 3638 case NVME_ZONE_ACTION_CLOSE: 3639 if (all) { 3640 proc_mask = NVME_PROC_OPENED_ZONES; 3641 } 3642 trace_pci_nvme_close_zone(slba, zone_idx, all); 3643 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req); 3644 break; 3645 3646 case NVME_ZONE_ACTION_FINISH: 3647 if (all) { 3648 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; 3649 } 3650 trace_pci_nvme_finish_zone(slba, zone_idx, all); 3651 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req); 3652 break; 3653 3654 case NVME_ZONE_ACTION_RESET: 3655 trace_pci_nvme_reset_zone(slba, zone_idx, all); 3656 3657 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk, 3658 nvme_misc_cb, req); 3659 3660 iocb->req = req; 3661 iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb); 3662 iocb->ret = 0; 3663 iocb->all = all; 3664 iocb->idx = zone_idx; 3665 iocb->zone = NULL; 3666 3667 req->aiocb = &iocb->common; 3668 nvme_zone_reset_cb(iocb, 0); 3669 3670 return NVME_NO_COMPLETE; 3671 3672 case NVME_ZONE_ACTION_OFFLINE: 3673 if (all) { 3674 proc_mask = NVME_PROC_READ_ONLY_ZONES; 3675 } 3676 trace_pci_nvme_offline_zone(slba, zone_idx, all); 3677 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req); 3678 break; 3679 3680 case NVME_ZONE_ACTION_SET_ZD_EXT: 3681 trace_pci_nvme_set_descriptor_extension(slba, zone_idx); 3682 if (all || !ns->params.zd_extension_size) { 3683 return NVME_INVALID_FIELD | NVME_DNR; 3684 } 3685 zd_ext = nvme_get_zd_extension(ns, zone_idx); 3686 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req); 3687 if (status) { 3688 trace_pci_nvme_err_zd_extension_map_error(zone_idx); 3689 return status; 3690 } 3691 3692 status = nvme_set_zd_ext(ns, zone); 3693 if (status == NVME_SUCCESS) { 3694 trace_pci_nvme_zd_extension_set(zone_idx); 3695 return status; 3696 } 3697 break; 3698 3699 default: 3700 trace_pci_nvme_err_invalid_mgmt_action(action); 3701 status = NVME_INVALID_FIELD; 3702 } 3703 3704 if (status == NVME_ZONE_INVAL_TRANSITION) { 3705 trace_pci_nvme_err_invalid_zone_state_transition(action, slba, 3706 zone->d.za); 3707 } 3708 if (status) { 3709 status |= NVME_DNR; 3710 } 3711 3712 return status; 3713 } 3714 3715 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) 3716 { 3717 NvmeZoneState zs = nvme_get_zone_state(zl); 3718 3719 switch (zafs) { 3720 case NVME_ZONE_REPORT_ALL: 3721 return true; 3722 case NVME_ZONE_REPORT_EMPTY: 3723 return zs == NVME_ZONE_STATE_EMPTY; 3724 case NVME_ZONE_REPORT_IMPLICITLY_OPEN: 3725 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN; 3726 case NVME_ZONE_REPORT_EXPLICITLY_OPEN: 3727 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN; 3728 case NVME_ZONE_REPORT_CLOSED: 3729 return zs == NVME_ZONE_STATE_CLOSED; 3730 case NVME_ZONE_REPORT_FULL: 3731 return zs == NVME_ZONE_STATE_FULL; 3732 case NVME_ZONE_REPORT_READ_ONLY: 3733 return zs == NVME_ZONE_STATE_READ_ONLY; 3734 case NVME_ZONE_REPORT_OFFLINE: 3735 return zs == NVME_ZONE_STATE_OFFLINE; 3736 default: 3737 return false; 3738 } 3739 } 3740 3741 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) 3742 { 3743 NvmeCmd *cmd = (NvmeCmd *)&req->cmd; 3744 NvmeNamespace *ns = req->ns; 3745 /* cdw12 is zero-based number of dwords to return. 
Convert to bytes */ 3746 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2; 3747 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 3748 uint32_t zone_idx, zra, zrasf, partial; 3749 uint64_t max_zones, nr_zones = 0; 3750 uint16_t status; 3751 uint64_t slba; 3752 NvmeZoneDescr *z; 3753 NvmeZone *zone; 3754 NvmeZoneReportHeader *header; 3755 void *buf, *buf_p; 3756 size_t zone_entry_sz; 3757 int i; 3758 3759 req->status = NVME_SUCCESS; 3760 3761 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); 3762 if (status) { 3763 return status; 3764 } 3765 3766 zra = dw13 & 0xff; 3767 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) { 3768 return NVME_INVALID_FIELD | NVME_DNR; 3769 } 3770 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) { 3771 return NVME_INVALID_FIELD | NVME_DNR; 3772 } 3773 3774 zrasf = (dw13 >> 8) & 0xff; 3775 if (zrasf > NVME_ZONE_REPORT_OFFLINE) { 3776 return NVME_INVALID_FIELD | NVME_DNR; 3777 } 3778 3779 if (data_size < sizeof(NvmeZoneReportHeader)) { 3780 return NVME_INVALID_FIELD | NVME_DNR; 3781 } 3782 3783 status = nvme_check_mdts(n, data_size); 3784 if (status) { 3785 return status; 3786 } 3787 3788 partial = (dw13 >> 16) & 0x01; 3789 3790 zone_entry_sz = sizeof(NvmeZoneDescr); 3791 if (zra == NVME_ZONE_REPORT_EXTENDED) { 3792 zone_entry_sz += ns->params.zd_extension_size; 3793 } 3794 3795 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; 3796 buf = g_malloc0(data_size); 3797 3798 zone = &ns->zone_array[zone_idx]; 3799 for (i = zone_idx; i < ns->num_zones; i++) { 3800 if (partial && nr_zones >= max_zones) { 3801 break; 3802 } 3803 if (nvme_zone_matches_filter(zrasf, zone++)) { 3804 nr_zones++; 3805 } 3806 } 3807 header = (NvmeZoneReportHeader *)buf; 3808 header->nr_zones = cpu_to_le64(nr_zones); 3809 3810 buf_p = buf + sizeof(NvmeZoneReportHeader); 3811 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) { 3812 zone = &ns->zone_array[zone_idx]; 3813 if (nvme_zone_matches_filter(zrasf, zone)) { 3814 z = (NvmeZoneDescr *)buf_p; 3815 buf_p += sizeof(NvmeZoneDescr); 3816 3817 z->zt = zone->d.zt; 3818 z->zs = zone->d.zs; 3819 z->zcap = cpu_to_le64(zone->d.zcap); 3820 z->zslba = cpu_to_le64(zone->d.zslba); 3821 z->za = zone->d.za; 3822 3823 if (nvme_wp_is_valid(zone)) { 3824 z->wp = cpu_to_le64(zone->d.wp); 3825 } else { 3826 z->wp = cpu_to_le64(~0ULL); 3827 } 3828 3829 if (zra == NVME_ZONE_REPORT_EXTENDED) { 3830 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) { 3831 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx), 3832 ns->params.zd_extension_size); 3833 } 3834 buf_p += ns->params.zd_extension_size; 3835 } 3836 3837 max_zones--; 3838 } 3839 } 3840 3841 status = nvme_c2h(n, (uint8_t *)buf, data_size, req); 3842 3843 g_free(buf); 3844 3845 return status; 3846 } 3847 3848 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) 3849 { 3850 NvmeNamespace *ns; 3851 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 3852 3853 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), 3854 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); 3855 3856 if (!nvme_nsid_valid(n, nsid)) { 3857 return NVME_INVALID_NSID | NVME_DNR; 3858 } 3859 3860 /* 3861 * In the base NVM command set, Flush may apply to all namespaces 3862 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used 3863 * along with TP 4056 (Namespace Types), it may be pretty screwed up. 
3864 * 3865 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the 3866 * opcode with a specific command since we cannot determine a unique I/O 3867 * command set. Opcode 0h could have any other meaning than something 3868 * equivalent to flushing and say it DOES have completely different 3869 * semantics in some other command set - does an NSID of FFFFFFFFh then 3870 * mean "for all namespaces, apply whatever command set specific command 3871 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply 3872 * whatever command that uses the 0h opcode if, and only if, it allows NSID 3873 * to be FFFFFFFFh"? 3874 * 3875 * Anyway (and luckily), for now, we do not care about this since the 3876 * device only supports namespace types that includes the NVM Flush command 3877 * (NVM and Zoned), so always do an NVM Flush. 3878 */ 3879 if (req->cmd.opcode == NVME_CMD_FLUSH) { 3880 return nvme_flush(n, req); 3881 } 3882 3883 ns = nvme_ns(n, nsid); 3884 if (unlikely(!ns)) { 3885 return NVME_INVALID_FIELD | NVME_DNR; 3886 } 3887 3888 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 3889 trace_pci_nvme_err_invalid_opc(req->cmd.opcode); 3890 return NVME_INVALID_OPCODE | NVME_DNR; 3891 } 3892 3893 if (ns->status) { 3894 return ns->status; 3895 } 3896 3897 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) { 3898 return NVME_INVALID_FIELD; 3899 } 3900 3901 req->ns = ns; 3902 3903 switch (req->cmd.opcode) { 3904 case NVME_CMD_WRITE_ZEROES: 3905 return nvme_write_zeroes(n, req); 3906 case NVME_CMD_ZONE_APPEND: 3907 return nvme_zone_append(n, req); 3908 case NVME_CMD_WRITE: 3909 return nvme_write(n, req); 3910 case NVME_CMD_READ: 3911 return nvme_read(n, req); 3912 case NVME_CMD_COMPARE: 3913 return nvme_compare(n, req); 3914 case NVME_CMD_DSM: 3915 return nvme_dsm(n, req); 3916 case NVME_CMD_VERIFY: 3917 return nvme_verify(n, req); 3918 case NVME_CMD_COPY: 3919 return nvme_copy(n, req); 3920 case NVME_CMD_ZONE_MGMT_SEND: 3921 return nvme_zone_mgmt_send(n, req); 3922 case NVME_CMD_ZONE_MGMT_RECV: 3923 return nvme_zone_mgmt_recv(n, req); 3924 default: 3925 assert(false); 3926 } 3927 3928 return NVME_INVALID_OPCODE | NVME_DNR; 3929 } 3930 3931 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) 3932 { 3933 n->sq[sq->sqid] = NULL; 3934 timer_free(sq->timer); 3935 g_free(sq->io_req); 3936 if (sq->sqid) { 3937 g_free(sq); 3938 } 3939 } 3940 3941 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) 3942 { 3943 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 3944 NvmeRequest *r, *next; 3945 NvmeSQueue *sq; 3946 NvmeCQueue *cq; 3947 uint16_t qid = le16_to_cpu(c->qid); 3948 3949 if (unlikely(!qid || nvme_check_sqid(n, qid))) { 3950 trace_pci_nvme_err_invalid_del_sq(qid); 3951 return NVME_INVALID_QID | NVME_DNR; 3952 } 3953 3954 trace_pci_nvme_del_sq(qid); 3955 3956 sq = n->sq[qid]; 3957 while (!QTAILQ_EMPTY(&sq->out_req_list)) { 3958 r = QTAILQ_FIRST(&sq->out_req_list); 3959 assert(r->aiocb); 3960 blk_aio_cancel(r->aiocb); 3961 } 3962 3963 assert(QTAILQ_EMPTY(&sq->out_req_list)); 3964 3965 if (!nvme_check_cqid(n, sq->cqid)) { 3966 cq = n->cq[sq->cqid]; 3967 QTAILQ_REMOVE(&cq->sq_list, sq, entry); 3968 3969 nvme_post_cqes(cq); 3970 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) { 3971 if (r->sq == sq) { 3972 QTAILQ_REMOVE(&cq->req_list, r, entry); 3973 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry); 3974 } 3975 } 3976 } 3977 3978 nvme_free_sq(sq, n); 3979 return NVME_SUCCESS; 3980 } 3981 3982 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, 3983 uint16_t sqid, uint16_t 
cqid, uint16_t size) 3984 { 3985 int i; 3986 NvmeCQueue *cq; 3987 3988 sq->ctrl = n; 3989 sq->dma_addr = dma_addr; 3990 sq->sqid = sqid; 3991 sq->size = size; 3992 sq->cqid = cqid; 3993 sq->head = sq->tail = 0; 3994 sq->io_req = g_new0(NvmeRequest, sq->size); 3995 3996 QTAILQ_INIT(&sq->req_list); 3997 QTAILQ_INIT(&sq->out_req_list); 3998 for (i = 0; i < sq->size; i++) { 3999 sq->io_req[i].sq = sq; 4000 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry); 4001 } 4002 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq); 4003 4004 assert(n->cq[cqid]); 4005 cq = n->cq[cqid]; 4006 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry); 4007 n->sq[sqid] = sq; 4008 } 4009 4010 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) 4011 { 4012 NvmeSQueue *sq; 4013 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd; 4014 4015 uint16_t cqid = le16_to_cpu(c->cqid); 4016 uint16_t sqid = le16_to_cpu(c->sqid); 4017 uint16_t qsize = le16_to_cpu(c->qsize); 4018 uint16_t qflags = le16_to_cpu(c->sq_flags); 4019 uint64_t prp1 = le64_to_cpu(c->prp1); 4020 4021 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags); 4022 4023 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) { 4024 trace_pci_nvme_err_invalid_create_sq_cqid(cqid); 4025 return NVME_INVALID_CQID | NVME_DNR; 4026 } 4027 if (unlikely(!sqid || sqid > n->params.max_ioqpairs || 4028 n->sq[sqid] != NULL)) { 4029 trace_pci_nvme_err_invalid_create_sq_sqid(sqid); 4030 return NVME_INVALID_QID | NVME_DNR; 4031 } 4032 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) { 4033 trace_pci_nvme_err_invalid_create_sq_size(qsize); 4034 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 4035 } 4036 if (unlikely(prp1 & (n->page_size - 1))) { 4037 trace_pci_nvme_err_invalid_create_sq_addr(prp1); 4038 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 4039 } 4040 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) { 4041 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags)); 4042 return NVME_INVALID_FIELD | NVME_DNR; 4043 } 4044 sq = g_malloc0(sizeof(*sq)); 4045 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1); 4046 return NVME_SUCCESS; 4047 } 4048 4049 struct nvme_stats { 4050 uint64_t units_read; 4051 uint64_t units_written; 4052 uint64_t read_commands; 4053 uint64_t write_commands; 4054 }; 4055 4056 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) 4057 { 4058 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); 4059 4060 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; 4061 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; 4062 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; 4063 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; 4064 } 4065 4066 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 4067 uint64_t off, NvmeRequest *req) 4068 { 4069 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 4070 struct nvme_stats stats = { 0 }; 4071 NvmeSmartLog smart = { 0 }; 4072 uint32_t trans_len; 4073 NvmeNamespace *ns; 4074 time_t current_ms; 4075 4076 if (off >= sizeof(smart)) { 4077 return NVME_INVALID_FIELD | NVME_DNR; 4078 } 4079 4080 if (nsid != 0xffffffff) { 4081 ns = nvme_ns(n, nsid); 4082 if (!ns) { 4083 return NVME_INVALID_NSID | NVME_DNR; 4084 } 4085 nvme_set_blk_stats(ns, &stats); 4086 } else { 4087 int i; 4088 4089 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4090 ns = nvme_ns(n, i); 4091 if (!ns) { 4092 continue; 4093 } 4094 nvme_set_blk_stats(ns, &stats); 4095 } 4096 } 4097 4098 trans_len = MIN(sizeof(smart) - off, buf_len); 4099 
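    /*
     * The host may request the SMART page at a (dword-aligned) byte offset;
     * only the window of trans_len bytes starting at 'off' is copied back
     * via nvme_c2h() below.
     */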
smart.critical_warning = n->smart_critical_warning; 4100 4101 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, 4102 1000)); 4103 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written, 4104 1000)); 4105 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); 4106 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); 4107 4108 smart.temperature = cpu_to_le16(n->temperature); 4109 4110 if ((n->temperature >= n->features.temp_thresh_hi) || 4111 (n->temperature <= n->features.temp_thresh_low)) { 4112 smart.critical_warning |= NVME_SMART_TEMPERATURE; 4113 } 4114 4115 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4116 smart.power_on_hours[0] = 4117 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60); 4118 4119 if (!rae) { 4120 nvme_clear_events(n, NVME_AER_TYPE_SMART); 4121 } 4122 4123 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); 4124 } 4125 4126 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, 4127 NvmeRequest *req) 4128 { 4129 uint32_t trans_len; 4130 NvmeFwSlotInfoLog fw_log = { 4131 .afi = 0x1, 4132 }; 4133 4134 if (off >= sizeof(fw_log)) { 4135 return NVME_INVALID_FIELD | NVME_DNR; 4136 } 4137 4138 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); 4139 trans_len = MIN(sizeof(fw_log) - off, buf_len); 4140 4141 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req); 4142 } 4143 4144 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 4145 uint64_t off, NvmeRequest *req) 4146 { 4147 uint32_t trans_len; 4148 NvmeErrorLog errlog; 4149 4150 if (off >= sizeof(errlog)) { 4151 return NVME_INVALID_FIELD | NVME_DNR; 4152 } 4153 4154 if (!rae) { 4155 nvme_clear_events(n, NVME_AER_TYPE_ERROR); 4156 } 4157 4158 memset(&errlog, 0x0, sizeof(errlog)); 4159 trans_len = MIN(sizeof(errlog) - off, buf_len); 4160 4161 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); 4162 } 4163 4164 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, 4165 uint64_t off, NvmeRequest *req) 4166 { 4167 uint32_t nslist[1024]; 4168 uint32_t trans_len; 4169 int i = 0; 4170 uint32_t nsid; 4171 4172 if (off >= sizeof(nslist)) { 4173 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist)); 4174 return NVME_INVALID_FIELD | NVME_DNR; 4175 } 4176 4177 memset(nslist, 0x0, sizeof(nslist)); 4178 trans_len = MIN(sizeof(nslist) - off, buf_len); 4179 4180 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != 4181 NVME_CHANGED_NSID_SIZE) { 4182 /* 4183 * If more than 1024 namespaces, the first entry in the log page should 4184 * be set to FFFFFFFFh and the others to 0 as spec. 4185 */ 4186 if (i == ARRAY_SIZE(nslist)) { 4187 memset(nslist, 0x0, sizeof(nslist)); 4188 nslist[0] = 0xffffffff; 4189 break; 4190 } 4191 4192 nslist[i++] = nsid; 4193 clear_bit(nsid, n->changed_nsids); 4194 } 4195 4196 /* 4197 * Remove all the remaining list entries in case returns directly due to 4198 * more than 1024 namespaces. 
4199 */ 4200 if (nslist[0] == 0xffffffff) { 4201 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE); 4202 } 4203 4204 if (!rae) { 4205 nvme_clear_events(n, NVME_AER_TYPE_NOTICE); 4206 } 4207 4208 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req); 4209 } 4210 4211 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, 4212 uint64_t off, NvmeRequest *req) 4213 { 4214 NvmeEffectsLog log = {}; 4215 const uint32_t *src_iocs = NULL; 4216 uint32_t trans_len; 4217 4218 if (off >= sizeof(log)) { 4219 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log)); 4220 return NVME_INVALID_FIELD | NVME_DNR; 4221 } 4222 4223 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) { 4224 case NVME_CC_CSS_NVM: 4225 src_iocs = nvme_cse_iocs_nvm; 4226 /* fall through */ 4227 case NVME_CC_CSS_ADMIN_ONLY: 4228 break; 4229 case NVME_CC_CSS_CSI: 4230 switch (csi) { 4231 case NVME_CSI_NVM: 4232 src_iocs = nvme_cse_iocs_nvm; 4233 break; 4234 case NVME_CSI_ZONED: 4235 src_iocs = nvme_cse_iocs_zoned; 4236 break; 4237 } 4238 } 4239 4240 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); 4241 4242 if (src_iocs) { 4243 memcpy(log.iocs, src_iocs, sizeof(log.iocs)); 4244 } 4245 4246 trans_len = MIN(sizeof(log) - off, buf_len); 4247 4248 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); 4249 } 4250 4251 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) 4252 { 4253 NvmeCmd *cmd = &req->cmd; 4254 4255 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4256 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4257 uint32_t dw12 = le32_to_cpu(cmd->cdw12); 4258 uint32_t dw13 = le32_to_cpu(cmd->cdw13); 4259 uint8_t lid = dw10 & 0xff; 4260 uint8_t lsp = (dw10 >> 8) & 0xf; 4261 uint8_t rae = (dw10 >> 15) & 0x1; 4262 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24; 4263 uint32_t numdl, numdu; 4264 uint64_t off, lpol, lpou; 4265 size_t len; 4266 uint16_t status; 4267 4268 numdl = (dw10 >> 16); 4269 numdu = (dw11 & 0xffff); 4270 lpol = dw12; 4271 lpou = dw13; 4272 4273 len = (((numdu << 16) | numdl) + 1) << 2; 4274 off = (lpou << 32ULL) | lpol; 4275 4276 if (off & 0x3) { 4277 return NVME_INVALID_FIELD | NVME_DNR; 4278 } 4279 4280 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off); 4281 4282 status = nvme_check_mdts(n, len); 4283 if (status) { 4284 return status; 4285 } 4286 4287 switch (lid) { 4288 case NVME_LOG_ERROR_INFO: 4289 return nvme_error_info(n, rae, len, off, req); 4290 case NVME_LOG_SMART_INFO: 4291 return nvme_smart_info(n, rae, len, off, req); 4292 case NVME_LOG_FW_SLOT_INFO: 4293 return nvme_fw_log_info(n, len, off, req); 4294 case NVME_LOG_CHANGED_NSLIST: 4295 return nvme_changed_nslist(n, rae, len, off, req); 4296 case NVME_LOG_CMD_EFFECTS: 4297 return nvme_cmd_effects(n, csi, len, off, req); 4298 default: 4299 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); 4300 return NVME_INVALID_FIELD | NVME_DNR; 4301 } 4302 } 4303 4304 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) 4305 { 4306 n->cq[cq->cqid] = NULL; 4307 timer_free(cq->timer); 4308 if (msix_enabled(&n->parent_obj)) { 4309 msix_vector_unuse(&n->parent_obj, cq->vector); 4310 } 4311 if (cq->cqid) { 4312 g_free(cq); 4313 } 4314 } 4315 4316 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) 4317 { 4318 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; 4319 NvmeCQueue *cq; 4320 uint16_t qid = le16_to_cpu(c->qid); 4321 4322 if (unlikely(!qid || nvme_check_cqid(n, qid))) { 4323 trace_pci_nvme_err_invalid_del_cq_cqid(qid); 4324 return NVME_INVALID_CQID | NVME_DNR; 4325 } 4326 4327 cq = n->cq[qid]; 4328 if 
(unlikely(!QTAILQ_EMPTY(&cq->sq_list))) { 4329 trace_pci_nvme_err_invalid_del_cq_notempty(qid); 4330 return NVME_INVALID_QUEUE_DEL; 4331 } 4332 4333 if (cq->irq_enabled && cq->tail != cq->head) { 4334 n->cq_pending--; 4335 } 4336 4337 nvme_irq_deassert(n, cq); 4338 trace_pci_nvme_del_cq(qid); 4339 nvme_free_cq(cq, n); 4340 return NVME_SUCCESS; 4341 } 4342 4343 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, 4344 uint16_t cqid, uint16_t vector, uint16_t size, 4345 uint16_t irq_enabled) 4346 { 4347 int ret; 4348 4349 if (msix_enabled(&n->parent_obj)) { 4350 ret = msix_vector_use(&n->parent_obj, vector); 4351 assert(ret == 0); 4352 } 4353 cq->ctrl = n; 4354 cq->cqid = cqid; 4355 cq->size = size; 4356 cq->dma_addr = dma_addr; 4357 cq->phase = 1; 4358 cq->irq_enabled = irq_enabled; 4359 cq->vector = vector; 4360 cq->head = cq->tail = 0; 4361 QTAILQ_INIT(&cq->req_list); 4362 QTAILQ_INIT(&cq->sq_list); 4363 n->cq[cqid] = cq; 4364 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); 4365 } 4366 4367 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) 4368 { 4369 NvmeCQueue *cq; 4370 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd; 4371 uint16_t cqid = le16_to_cpu(c->cqid); 4372 uint16_t vector = le16_to_cpu(c->irq_vector); 4373 uint16_t qsize = le16_to_cpu(c->qsize); 4374 uint16_t qflags = le16_to_cpu(c->cq_flags); 4375 uint64_t prp1 = le64_to_cpu(c->prp1); 4376 4377 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, 4378 NVME_CQ_FLAGS_IEN(qflags) != 0); 4379 4380 if (unlikely(!cqid || cqid > n->params.max_ioqpairs || 4381 n->cq[cqid] != NULL)) { 4382 trace_pci_nvme_err_invalid_create_cq_cqid(cqid); 4383 return NVME_INVALID_QID | NVME_DNR; 4384 } 4385 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) { 4386 trace_pci_nvme_err_invalid_create_cq_size(qsize); 4387 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; 4388 } 4389 if (unlikely(prp1 & (n->page_size - 1))) { 4390 trace_pci_nvme_err_invalid_create_cq_addr(prp1); 4391 return NVME_INVALID_PRP_OFFSET | NVME_DNR; 4392 } 4393 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) { 4394 trace_pci_nvme_err_invalid_create_cq_vector(vector); 4395 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 4396 } 4397 if (unlikely(vector >= n->params.msix_qsize)) { 4398 trace_pci_nvme_err_invalid_create_cq_vector(vector); 4399 return NVME_INVALID_IRQ_VECTOR | NVME_DNR; 4400 } 4401 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) { 4402 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags)); 4403 return NVME_INVALID_FIELD | NVME_DNR; 4404 } 4405 4406 cq = g_malloc0(sizeof(*cq)); 4407 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1, 4408 NVME_CQ_FLAGS_IEN(qflags)); 4409 4410 /* 4411 * It is only required to set qs_created when creating a completion queue; 4412 * creating a submission queue without a matching completion queue will 4413 * fail. 
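 *
 * Once set, qs_created also makes a later Set Features (Number of Queues)
 * command fail with Command Sequence Error (see nvme_set_feature below).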
4414 */ 4415 n->qs_created = true; 4416 return NVME_SUCCESS; 4417 } 4418 4419 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) 4420 { 4421 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 4422 4423 return nvme_c2h(n, id, sizeof(id), req); 4424 } 4425 4426 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) 4427 { 4428 trace_pci_nvme_identify_ctrl(); 4429 4430 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req); 4431 } 4432 4433 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) 4434 { 4435 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4436 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; 4437 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id; 4438 4439 trace_pci_nvme_identify_ctrl_csi(c->csi); 4440 4441 switch (c->csi) { 4442 case NVME_CSI_NVM: 4443 id_nvm->vsl = n->params.vsl; 4444 id_nvm->dmrsl = cpu_to_le32(n->dmrsl); 4445 break; 4446 4447 case NVME_CSI_ZONED: 4448 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl; 4449 break; 4450 4451 default: 4452 return NVME_INVALID_FIELD | NVME_DNR; 4453 } 4454 4455 return nvme_c2h(n, id, sizeof(id), req); 4456 } 4457 4458 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) 4459 { 4460 NvmeNamespace *ns; 4461 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4462 uint32_t nsid = le32_to_cpu(c->nsid); 4463 4464 trace_pci_nvme_identify_ns(nsid); 4465 4466 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4467 return NVME_INVALID_NSID | NVME_DNR; 4468 } 4469 4470 ns = nvme_ns(n, nsid); 4471 if (unlikely(!ns)) { 4472 if (!active) { 4473 ns = nvme_subsys_ns(n->subsys, nsid); 4474 if (!ns) { 4475 return nvme_rpt_empty_id_struct(n, req); 4476 } 4477 } else { 4478 return nvme_rpt_empty_id_struct(n, req); 4479 } 4480 } 4481 4482 if (active || ns->csi == NVME_CSI_NVM) { 4483 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); 4484 } 4485 4486 return NVME_INVALID_CMD_SET | NVME_DNR; 4487 } 4488 4489 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req, 4490 bool attached) 4491 { 4492 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4493 uint32_t nsid = le32_to_cpu(c->nsid); 4494 uint16_t min_id = le16_to_cpu(c->ctrlid); 4495 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 4496 uint16_t *ids = &list[1]; 4497 NvmeNamespace *ns; 4498 NvmeCtrl *ctrl; 4499 int cntlid, nr_ids = 0; 4500 4501 trace_pci_nvme_identify_ctrl_list(c->cns, min_id); 4502 4503 if (!n->subsys) { 4504 return NVME_INVALID_FIELD | NVME_DNR; 4505 } 4506 4507 if (attached) { 4508 if (nsid == NVME_NSID_BROADCAST) { 4509 return NVME_INVALID_FIELD | NVME_DNR; 4510 } 4511 4512 ns = nvme_subsys_ns(n->subsys, nsid); 4513 if (!ns) { 4514 return NVME_INVALID_FIELD | NVME_DNR; 4515 } 4516 } 4517 4518 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { 4519 ctrl = nvme_subsys_ctrl(n->subsys, cntlid); 4520 if (!ctrl) { 4521 continue; 4522 } 4523 4524 if (attached && !nvme_ns(ctrl, nsid)) { 4525 continue; 4526 } 4527 4528 ids[nr_ids++] = cntlid; 4529 } 4530 4531 list[0] = nr_ids; 4532 4533 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req); 4534 } 4535 4536 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, 4537 bool active) 4538 { 4539 NvmeNamespace *ns; 4540 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4541 uint32_t nsid = le32_to_cpu(c->nsid); 4542 4543 trace_pci_nvme_identify_ns_csi(nsid, c->csi); 4544 4545 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4546 return NVME_INVALID_NSID | NVME_DNR; 4547 } 4548 4549 ns = nvme_ns(n, nsid); 4550 
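    /*
     * If the namespace is not attached to this controller, fall back to the
     * subsystem-wide (allocated) namespace when the caller asked for
     * "present" namespaces; otherwise report an all-zeroes identify data
     * structure, mirroring nvme_identify_ns() above.
     */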
if (unlikely(!ns)) { 4551 if (!active) { 4552 ns = nvme_subsys_ns(n->subsys, nsid); 4553 if (!ns) { 4554 return nvme_rpt_empty_id_struct(n, req); 4555 } 4556 } else { 4557 return nvme_rpt_empty_id_struct(n, req); 4558 } 4559 } 4560 4561 if (c->csi == NVME_CSI_NVM) { 4562 return nvme_rpt_empty_id_struct(n, req); 4563 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { 4564 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), 4565 req); 4566 } 4567 4568 return NVME_INVALID_FIELD | NVME_DNR; 4569 } 4570 4571 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, 4572 bool active) 4573 { 4574 NvmeNamespace *ns; 4575 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4576 uint32_t min_nsid = le32_to_cpu(c->nsid); 4577 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4578 static const int data_len = sizeof(list); 4579 uint32_t *list_ptr = (uint32_t *)list; 4580 int i, j = 0; 4581 4582 trace_pci_nvme_identify_nslist(min_nsid); 4583 4584 /* 4585 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values 4586 * since the Active Namespace ID List should return namespaces with ids 4587 * *higher* than the NSID specified in the command. This is also specified 4588 * in the spec (NVM Express v1.3d, Section 5.15.4). 4589 */ 4590 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 4591 return NVME_INVALID_NSID | NVME_DNR; 4592 } 4593 4594 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4595 ns = nvme_ns(n, i); 4596 if (!ns) { 4597 if (!active) { 4598 ns = nvme_subsys_ns(n->subsys, i); 4599 if (!ns) { 4600 continue; 4601 } 4602 } else { 4603 continue; 4604 } 4605 } 4606 if (ns->params.nsid <= min_nsid) { 4607 continue; 4608 } 4609 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 4610 if (j == data_len / sizeof(uint32_t)) { 4611 break; 4612 } 4613 } 4614 4615 return nvme_c2h(n, list, data_len, req); 4616 } 4617 4618 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, 4619 bool active) 4620 { 4621 NvmeNamespace *ns; 4622 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4623 uint32_t min_nsid = le32_to_cpu(c->nsid); 4624 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4625 static const int data_len = sizeof(list); 4626 uint32_t *list_ptr = (uint32_t *)list; 4627 int i, j = 0; 4628 4629 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); 4630 4631 /* 4632 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
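 * In addition, the CSI in CDW14 must select a command set implemented by
 * this controller (NVM or Zoned); any other value is rejected below with
 * Invalid Field in Command.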
4633 */ 4634 if (min_nsid >= NVME_NSID_BROADCAST - 1) { 4635 return NVME_INVALID_NSID | NVME_DNR; 4636 } 4637 4638 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) { 4639 return NVME_INVALID_FIELD | NVME_DNR; 4640 } 4641 4642 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4643 ns = nvme_ns(n, i); 4644 if (!ns) { 4645 if (!active) { 4646 ns = nvme_subsys_ns(n->subsys, i); 4647 if (!ns) { 4648 continue; 4649 } 4650 } else { 4651 continue; 4652 } 4653 } 4654 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { 4655 continue; 4656 } 4657 list_ptr[j++] = cpu_to_le32(ns->params.nsid); 4658 if (j == data_len / sizeof(uint32_t)) { 4659 break; 4660 } 4661 } 4662 4663 return nvme_c2h(n, list, data_len, req); 4664 } 4665 4666 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) 4667 { 4668 NvmeNamespace *ns; 4669 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4670 uint32_t nsid = le32_to_cpu(c->nsid); 4671 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4672 uint8_t *pos = list; 4673 struct { 4674 NvmeIdNsDescr hdr; 4675 uint8_t v[NVME_NIDL_UUID]; 4676 } QEMU_PACKED uuid = {}; 4677 struct { 4678 NvmeIdNsDescr hdr; 4679 uint64_t v; 4680 } QEMU_PACKED eui64 = {}; 4681 struct { 4682 NvmeIdNsDescr hdr; 4683 uint8_t v; 4684 } QEMU_PACKED csi = {}; 4685 4686 trace_pci_nvme_identify_ns_descr_list(nsid); 4687 4688 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4689 return NVME_INVALID_NSID | NVME_DNR; 4690 } 4691 4692 ns = nvme_ns(n, nsid); 4693 if (unlikely(!ns)) { 4694 return NVME_INVALID_FIELD | NVME_DNR; 4695 } 4696 4697 /* 4698 * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must 4699 * provide a valid Namespace UUID in the Namespace Identification Descriptor 4700 * data structure. QEMU does not yet support setting NGUID. 
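 *
 * As a consequence, the descriptor list built below always contains a UUID
 * and a CSI descriptor, and an EUI-64 descriptor only when the eui64
 * namespace parameter is non-zero.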
4701 */ 4702 uuid.hdr.nidt = NVME_NIDT_UUID; 4703 uuid.hdr.nidl = NVME_NIDL_UUID; 4704 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); 4705 memcpy(pos, &uuid, sizeof(uuid)); 4706 pos += sizeof(uuid); 4707 4708 if (ns->params.eui64) { 4709 eui64.hdr.nidt = NVME_NIDT_EUI64; 4710 eui64.hdr.nidl = NVME_NIDL_EUI64; 4711 eui64.v = cpu_to_be64(ns->params.eui64); 4712 memcpy(pos, &eui64, sizeof(eui64)); 4713 pos += sizeof(eui64); 4714 } 4715 4716 csi.hdr.nidt = NVME_NIDT_CSI; 4717 csi.hdr.nidl = NVME_NIDL_CSI; 4718 csi.v = ns->csi; 4719 memcpy(pos, &csi, sizeof(csi)); 4720 pos += sizeof(csi); 4721 4722 return nvme_c2h(n, list, sizeof(list), req); 4723 } 4724 4725 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) 4726 { 4727 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; 4728 static const int data_len = sizeof(list); 4729 4730 trace_pci_nvme_identify_cmd_set(); 4731 4732 NVME_SET_CSI(*list, NVME_CSI_NVM); 4733 NVME_SET_CSI(*list, NVME_CSI_ZONED); 4734 4735 return nvme_c2h(n, list, data_len, req); 4736 } 4737 4738 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) 4739 { 4740 NvmeIdentify *c = (NvmeIdentify *)&req->cmd; 4741 4742 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid), 4743 c->csi); 4744 4745 switch (c->cns) { 4746 case NVME_ID_CNS_NS: 4747 return nvme_identify_ns(n, req, true); 4748 case NVME_ID_CNS_NS_PRESENT: 4749 return nvme_identify_ns(n, req, false); 4750 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: 4751 return nvme_identify_ctrl_list(n, req, true); 4752 case NVME_ID_CNS_CTRL_LIST: 4753 return nvme_identify_ctrl_list(n, req, false); 4754 case NVME_ID_CNS_CS_NS: 4755 return nvme_identify_ns_csi(n, req, true); 4756 case NVME_ID_CNS_CS_NS_PRESENT: 4757 return nvme_identify_ns_csi(n, req, false); 4758 case NVME_ID_CNS_CTRL: 4759 return nvme_identify_ctrl(n, req); 4760 case NVME_ID_CNS_CS_CTRL: 4761 return nvme_identify_ctrl_csi(n, req); 4762 case NVME_ID_CNS_NS_ACTIVE_LIST: 4763 return nvme_identify_nslist(n, req, true); 4764 case NVME_ID_CNS_NS_PRESENT_LIST: 4765 return nvme_identify_nslist(n, req, false); 4766 case NVME_ID_CNS_CS_NS_ACTIVE_LIST: 4767 return nvme_identify_nslist_csi(n, req, true); 4768 case NVME_ID_CNS_CS_NS_PRESENT_LIST: 4769 return nvme_identify_nslist_csi(n, req, false); 4770 case NVME_ID_CNS_NS_DESCR_LIST: 4771 return nvme_identify_ns_descr_list(n, req); 4772 case NVME_ID_CNS_IO_COMMAND_SET: 4773 return nvme_identify_cmd_set(n, req); 4774 default: 4775 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); 4776 return NVME_INVALID_FIELD | NVME_DNR; 4777 } 4778 } 4779 4780 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req) 4781 { 4782 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff; 4783 4784 req->cqe.result = 1; 4785 if (nvme_check_sqid(n, sqid)) { 4786 return NVME_INVALID_FIELD | NVME_DNR; 4787 } 4788 4789 return NVME_SUCCESS; 4790 } 4791 4792 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts) 4793 { 4794 trace_pci_nvme_setfeat_timestamp(ts); 4795 4796 n->host_timestamp = le64_to_cpu(ts); 4797 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4798 } 4799 4800 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) 4801 { 4802 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 4803 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms; 4804 4805 union nvme_timestamp { 4806 struct { 4807 uint64_t timestamp:48; 4808 uint64_t sync:1; 4809 uint64_t origin:3; 4810 uint64_t rsvd1:12; 4811 }; 4812 uint64_t all; 4813 }; 4814 4815 union 
nvme_timestamp ts; 4816 ts.all = 0; 4817 ts.timestamp = n->host_timestamp + elapsed_time; 4818 4819 /* If the host timestamp is non-zero, set the timestamp origin */ 4820 ts.origin = n->host_timestamp ? 0x01 : 0x00; 4821 4822 trace_pci_nvme_getfeat_timestamp(ts.all); 4823 4824 return cpu_to_le64(ts.all); 4825 } 4826 4827 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) 4828 { 4829 uint64_t timestamp = nvme_get_timestamp(n); 4830 4831 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req); 4832 } 4833 4834 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) 4835 { 4836 NvmeCmd *cmd = &req->cmd; 4837 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 4838 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 4839 uint32_t nsid = le32_to_cpu(cmd->nsid); 4840 uint32_t result; 4841 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 4842 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); 4843 uint16_t iv; 4844 NvmeNamespace *ns; 4845 int i; 4846 4847 static const uint32_t nvme_feature_default[NVME_FID_MAX] = { 4848 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, 4849 }; 4850 4851 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11); 4852 4853 if (!nvme_feature_support[fid]) { 4854 return NVME_INVALID_FIELD | NVME_DNR; 4855 } 4856 4857 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 4858 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { 4859 /* 4860 * The Reservation Notification Mask and Reservation Persistence 4861 * features require a status code of Invalid Field in Command when 4862 * NSID is FFFFFFFFh. Since the device does not support those 4863 * features we can always return Invalid Namespace or Format as we 4864 * should do for all other features. 4865 */ 4866 return NVME_INVALID_NSID | NVME_DNR; 4867 } 4868 4869 if (!nvme_ns(n, nsid)) { 4870 return NVME_INVALID_FIELD | NVME_DNR; 4871 } 4872 } 4873 4874 switch (sel) { 4875 case NVME_GETFEAT_SELECT_CURRENT: 4876 break; 4877 case NVME_GETFEAT_SELECT_SAVED: 4878 /* no features are saveable by the controller; fallthrough */ 4879 case NVME_GETFEAT_SELECT_DEFAULT: 4880 goto defaults; 4881 case NVME_GETFEAT_SELECT_CAP: 4882 result = nvme_feature_cap[fid]; 4883 goto out; 4884 } 4885 4886 switch (fid) { 4887 case NVME_TEMPERATURE_THRESHOLD: 4888 result = 0; 4889 4890 /* 4891 * The controller only implements the Composite Temperature sensor, so 4892 * return 0 for all other sensors. 4893 */ 4894 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4895 goto out; 4896 } 4897 4898 switch (NVME_TEMP_THSEL(dw11)) { 4899 case NVME_TEMP_THSEL_OVER: 4900 result = n->features.temp_thresh_hi; 4901 goto out; 4902 case NVME_TEMP_THSEL_UNDER: 4903 result = n->features.temp_thresh_low; 4904 goto out; 4905 } 4906 4907 return NVME_INVALID_FIELD | NVME_DNR; 4908 case NVME_ERROR_RECOVERY: 4909 if (!nvme_nsid_valid(n, nsid)) { 4910 return NVME_INVALID_NSID | NVME_DNR; 4911 } 4912 4913 ns = nvme_ns(n, nsid); 4914 if (unlikely(!ns)) { 4915 return NVME_INVALID_FIELD | NVME_DNR; 4916 } 4917 4918 result = ns->features.err_rec; 4919 goto out; 4920 case NVME_VOLATILE_WRITE_CACHE: 4921 result = 0; 4922 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 4923 ns = nvme_ns(n, i); 4924 if (!ns) { 4925 continue; 4926 } 4927 4928 result = blk_enable_write_cache(ns->blkconf.blk); 4929 if (result) { 4930 break; 4931 } 4932 } 4933 trace_pci_nvme_getfeat_vwcache(result ?
"enabled" : "disabled"); 4934 goto out; 4935 case NVME_ASYNCHRONOUS_EVENT_CONF: 4936 result = n->features.async_config; 4937 goto out; 4938 case NVME_TIMESTAMP: 4939 return nvme_get_feature_timestamp(n, req); 4940 default: 4941 break; 4942 } 4943 4944 defaults: 4945 switch (fid) { 4946 case NVME_TEMPERATURE_THRESHOLD: 4947 result = 0; 4948 4949 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 4950 break; 4951 } 4952 4953 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) { 4954 result = NVME_TEMPERATURE_WARNING; 4955 } 4956 4957 break; 4958 case NVME_NUMBER_OF_QUEUES: 4959 result = (n->params.max_ioqpairs - 1) | 4960 ((n->params.max_ioqpairs - 1) << 16); 4961 trace_pci_nvme_getfeat_numq(result); 4962 break; 4963 case NVME_INTERRUPT_VECTOR_CONF: 4964 iv = dw11 & 0xffff; 4965 if (iv >= n->params.max_ioqpairs + 1) { 4966 return NVME_INVALID_FIELD | NVME_DNR; 4967 } 4968 4969 result = iv; 4970 if (iv == n->admin_cq.vector) { 4971 result |= NVME_INTVC_NOCOALESCING; 4972 } 4973 break; 4974 default: 4975 result = nvme_feature_default[fid]; 4976 break; 4977 } 4978 4979 out: 4980 req->cqe.result = cpu_to_le32(result); 4981 return NVME_SUCCESS; 4982 } 4983 4984 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) 4985 { 4986 uint16_t ret; 4987 uint64_t timestamp; 4988 4989 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req); 4990 if (ret) { 4991 return ret; 4992 } 4993 4994 nvme_set_timestamp(n, timestamp); 4995 4996 return NVME_SUCCESS; 4997 } 4998 4999 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) 5000 { 5001 NvmeNamespace *ns = NULL; 5002 5003 NvmeCmd *cmd = &req->cmd; 5004 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 5005 uint32_t dw11 = le32_to_cpu(cmd->cdw11); 5006 uint32_t nsid = le32_to_cpu(cmd->nsid); 5007 uint8_t fid = NVME_GETSETFEAT_FID(dw10); 5008 uint8_t save = NVME_SETFEAT_SAVE(dw10); 5009 int i; 5010 5011 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); 5012 5013 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) { 5014 return NVME_FID_NOT_SAVEABLE | NVME_DNR; 5015 } 5016 5017 if (!nvme_feature_support[fid]) { 5018 return NVME_INVALID_FIELD | NVME_DNR; 5019 } 5020 5021 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { 5022 if (nsid != NVME_NSID_BROADCAST) { 5023 if (!nvme_nsid_valid(n, nsid)) { 5024 return NVME_INVALID_NSID | NVME_DNR; 5025 } 5026 5027 ns = nvme_ns(n, nsid); 5028 if (unlikely(!ns)) { 5029 return NVME_INVALID_FIELD | NVME_DNR; 5030 } 5031 } 5032 } else if (nsid && nsid != NVME_NSID_BROADCAST) { 5033 if (!nvme_nsid_valid(n, nsid)) { 5034 return NVME_INVALID_NSID | NVME_DNR; 5035 } 5036 5037 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR; 5038 } 5039 5040 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) { 5041 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 5042 } 5043 5044 switch (fid) { 5045 case NVME_TEMPERATURE_THRESHOLD: 5046 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { 5047 break; 5048 } 5049 5050 switch (NVME_TEMP_THSEL(dw11)) { 5051 case NVME_TEMP_THSEL_OVER: 5052 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11); 5053 break; 5054 case NVME_TEMP_THSEL_UNDER: 5055 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11); 5056 break; 5057 default: 5058 return NVME_INVALID_FIELD | NVME_DNR; 5059 } 5060 5061 if ((n->temperature >= n->features.temp_thresh_hi) || 5062 (n->temperature <= n->features.temp_thresh_low)) { 5063 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH); 5064 } 5065 5066 break; 5067 case NVME_ERROR_RECOVERY: 5068 if (nsid == NVME_NSID_BROADCAST) { 5069 for (i = 1; i <=
NVME_MAX_NAMESPACES; i++) { 5070 ns = nvme_ns(n, i); 5071 5072 if (!ns) { 5073 continue; 5074 } 5075 5076 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 5077 ns->features.err_rec = dw11; 5078 } 5079 } 5080 5081 break; 5082 } 5083 5084 assert(ns); 5085 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { 5086 ns->features.err_rec = dw11; 5087 } 5088 break; 5089 case NVME_VOLATILE_WRITE_CACHE: 5090 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5091 ns = nvme_ns(n, i); 5092 if (!ns) { 5093 continue; 5094 } 5095 5096 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) { 5097 blk_flush(ns->blkconf.blk); 5098 } 5099 5100 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1); 5101 } 5102 5103 break; 5104 5105 case NVME_NUMBER_OF_QUEUES: 5106 if (n->qs_created) { 5107 return NVME_CMD_SEQ_ERROR | NVME_DNR; 5108 } 5109 5110 /* 5111 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR 5112 * and NSQR. 5113 */ 5114 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) { 5115 return NVME_INVALID_FIELD | NVME_DNR; 5116 } 5117 5118 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1, 5119 ((dw11 >> 16) & 0xffff) + 1, 5120 n->params.max_ioqpairs, 5121 n->params.max_ioqpairs); 5122 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) | 5123 ((n->params.max_ioqpairs - 1) << 16)); 5124 break; 5125 case NVME_ASYNCHRONOUS_EVENT_CONF: 5126 n->features.async_config = dw11; 5127 break; 5128 case NVME_TIMESTAMP: 5129 return nvme_set_feature_timestamp(n, req); 5130 case NVME_COMMAND_SET_PROFILE: 5131 if (dw11 & 0x1ff) { 5132 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); 5133 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR; 5134 } 5135 break; 5136 default: 5137 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; 5138 } 5139 return NVME_SUCCESS; 5140 } 5141 5142 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) 5143 { 5144 trace_pci_nvme_aer(nvme_cid(req)); 5145 5146 if (n->outstanding_aers > n->params.aerl) { 5147 trace_pci_nvme_aer_aerl_exceeded(); 5148 return NVME_AER_LIMIT_EXCEEDED; 5149 } 5150 5151 n->aer_reqs[n->outstanding_aers] = req; 5152 n->outstanding_aers++; 5153 5154 if (!QTAILQ_EMPTY(&n->aer_queue)) { 5155 nvme_process_aers(n); 5156 } 5157 5158 return NVME_NO_COMPLETE; 5159 } 5160 5161 static void nvme_update_dmrsl(NvmeCtrl *n) 5162 { 5163 int nsid; 5164 5165 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { 5166 NvmeNamespace *ns = nvme_ns(n, nsid); 5167 if (!ns) { 5168 continue; 5169 } 5170 5171 n->dmrsl = MIN_NON_ZERO(n->dmrsl, 5172 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 5173 } 5174 } 5175 5176 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns) 5177 { 5178 uint32_t cc = ldl_le_p(&n->bar.cc); 5179 5180 ns->iocs = nvme_cse_iocs_none; 5181 switch (ns->csi) { 5182 case NVME_CSI_NVM: 5183 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) { 5184 ns->iocs = nvme_cse_iocs_nvm; 5185 } 5186 break; 5187 case NVME_CSI_ZONED: 5188 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) { 5189 ns->iocs = nvme_cse_iocs_zoned; 5190 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) { 5191 ns->iocs = nvme_cse_iocs_nvm; 5192 } 5193 break; 5194 } 5195 } 5196 5197 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) 5198 { 5199 NvmeNamespace *ns; 5200 NvmeCtrl *ctrl; 5201 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; 5202 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 5203 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 5204 uint8_t sel = dw10 & 0xf; 5205 uint16_t *nr_ids = &list[0]; 5206 uint16_t *ids = &list[1]; 5207 uint16_t ret; 5208 int i; 5209 5210 
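    /*
     * The host supplies a Controller List (a 16-bit entry count followed by
     * controller identifiers) in the data buffer; bits 3:0 of CDW10 select
     * attach or detach. Each listed controller is looked up in the shared
     * subsystem and the namespace is attached to or detached from it in
     * turn.
     */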
trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf); 5211 5212 if (!nvme_nsid_valid(n, nsid)) { 5213 return NVME_INVALID_NSID | NVME_DNR; 5214 } 5215 5216 ns = nvme_subsys_ns(n->subsys, nsid); 5217 if (!ns) { 5218 return NVME_INVALID_FIELD | NVME_DNR; 5219 } 5220 5221 ret = nvme_h2c(n, (uint8_t *)list, 4096, req); 5222 if (ret) { 5223 return ret; 5224 } 5225 5226 if (!*nr_ids) { 5227 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 5228 } 5229 5230 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1); 5231 for (i = 0; i < *nr_ids; i++) { 5232 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]); 5233 if (!ctrl) { 5234 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; 5235 } 5236 5237 switch (sel) { 5238 case NVME_NS_ATTACHMENT_ATTACH: 5239 if (nvme_ns(ctrl, nsid)) { 5240 return NVME_NS_ALREADY_ATTACHED | NVME_DNR; 5241 } 5242 5243 if (ns->attached && !ns->params.shared) { 5244 return NVME_NS_PRIVATE | NVME_DNR; 5245 } 5246 5247 nvme_attach_ns(ctrl, ns); 5248 nvme_select_iocs_ns(ctrl, ns); 5249 5250 break; 5251 5252 case NVME_NS_ATTACHMENT_DETACH: 5253 if (!nvme_ns(ctrl, nsid)) { 5254 return NVME_NS_NOT_ATTACHED | NVME_DNR; 5255 } 5256 5257 ctrl->namespaces[nsid] = NULL; 5258 ns->attached--; 5259 5260 nvme_update_dmrsl(ctrl); 5261 5262 break; 5263 5264 default: 5265 return NVME_INVALID_FIELD | NVME_DNR; 5266 } 5267 5268 /* 5269 * Add namespace id to the changed namespace id list for event clearing 5270 * via Get Log Page command. 5271 */ 5272 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { 5273 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, 5274 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, 5275 NVME_LOG_CHANGED_NSLIST); 5276 } 5277 } 5278 5279 return NVME_SUCCESS; 5280 } 5281 5282 typedef struct NvmeFormatAIOCB { 5283 BlockAIOCB common; 5284 BlockAIOCB *aiocb; 5285 QEMUBH *bh; 5286 NvmeRequest *req; 5287 int ret; 5288 5289 NvmeNamespace *ns; 5290 uint32_t nsid; 5291 bool broadcast; 5292 int64_t offset; 5293 } NvmeFormatAIOCB; 5294 5295 static void nvme_format_bh(void *opaque); 5296 5297 static void nvme_format_cancel(BlockAIOCB *aiocb) 5298 { 5299 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common); 5300 5301 if (iocb->aiocb) { 5302 blk_aio_cancel_async(iocb->aiocb); 5303 } 5304 } 5305 5306 static const AIOCBInfo nvme_format_aiocb_info = { 5307 .aiocb_size = sizeof(NvmeFormatAIOCB), 5308 .cancel_async = nvme_format_cancel, 5309 .get_aio_context = nvme_get_aio_context, 5310 }; 5311 5312 static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd) 5313 { 5314 uint32_t dw10 = le32_to_cpu(cmd->cdw10); 5315 uint8_t lbaf = dw10 & 0xf; 5316 uint8_t pi = (dw10 >> 5) & 0x7; 5317 uint8_t mset = (dw10 >> 4) & 0x1; 5318 uint8_t pil = (dw10 >> 8) & 0x1; 5319 5320 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil); 5321 5322 ns->id_ns.dps = (pil << 3) | pi; 5323 ns->id_ns.flbas = lbaf | (mset << 4); 5324 5325 nvme_ns_init_format(ns); 5326 } 5327 5328 static void nvme_format_ns_cb(void *opaque, int ret) 5329 { 5330 NvmeFormatAIOCB *iocb = opaque; 5331 NvmeRequest *req = iocb->req; 5332 NvmeNamespace *ns = iocb->ns; 5333 int bytes; 5334 5335 if (ret < 0) { 5336 iocb->ret = ret; 5337 goto done; 5338 } 5339 5340 assert(ns); 5341 5342 if (iocb->offset < ns->size) { 5343 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset); 5344 5345 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset, 5346 bytes, BDRV_REQ_MAY_UNMAP, 5347 nvme_format_ns_cb, iocb); 5348 5349 iocb->offset += bytes; 5350 return; 5351 } 5352 5353 nvme_format_set(ns, &req->cmd); 5354 ns->status = 0x0; 5355 
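    /*
     * This namespace has been fully formatted; reset the per-iocb state so
     * the bottom half can move on to the next namespace (for a broadcast
     * Format NVM) or complete the request.
     */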
iocb->ns = NULL; 5356 iocb->offset = 0; 5357 5358 done: 5359 iocb->aiocb = NULL; 5360 qemu_bh_schedule(iocb->bh); 5361 } 5362 5363 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi) 5364 { 5365 if (ns->params.zoned) { 5366 return NVME_INVALID_FORMAT | NVME_DNR; 5367 } 5368 5369 if (lbaf > ns->id_ns.nlbaf) { 5370 return NVME_INVALID_FORMAT | NVME_DNR; 5371 } 5372 5373 if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) { 5374 return NVME_INVALID_FORMAT | NVME_DNR; 5375 } 5376 5377 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) { 5378 return NVME_INVALID_FIELD | NVME_DNR; 5379 } 5380 5381 return NVME_SUCCESS; 5382 } 5383 5384 static void nvme_format_bh(void *opaque) 5385 { 5386 NvmeFormatAIOCB *iocb = opaque; 5387 NvmeRequest *req = iocb->req; 5388 NvmeCtrl *n = nvme_ctrl(req); 5389 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); 5390 uint8_t lbaf = dw10 & 0xf; 5391 uint8_t pi = (dw10 >> 5) & 0x7; 5392 uint16_t status; 5393 int i; 5394 5395 if (iocb->ret < 0) { 5396 goto done; 5397 } 5398 5399 if (iocb->broadcast) { 5400 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) { 5401 iocb->ns = nvme_ns(n, i); 5402 if (iocb->ns) { 5403 iocb->nsid = i; 5404 break; 5405 } 5406 } 5407 } 5408 5409 if (!iocb->ns) { 5410 goto done; 5411 } 5412 5413 status = nvme_format_check(iocb->ns, lbaf, pi); 5414 if (status) { 5415 req->status = status; 5416 goto done; 5417 } 5418 5419 iocb->ns->status = NVME_FORMAT_IN_PROGRESS; 5420 nvme_format_ns_cb(iocb, 0); 5421 return; 5422 5423 done: 5424 qemu_bh_delete(iocb->bh); 5425 iocb->bh = NULL; 5426 5427 iocb->common.cb(iocb->common.opaque, iocb->ret); 5428 5429 qemu_aio_unref(iocb); 5430 } 5431 5432 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req) 5433 { 5434 NvmeFormatAIOCB *iocb; 5435 uint32_t nsid = le32_to_cpu(req->cmd.nsid); 5436 uint16_t status; 5437 5438 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req); 5439 5440 iocb->req = req; 5441 iocb->bh = qemu_bh_new(nvme_format_bh, iocb); 5442 iocb->ret = 0; 5443 iocb->ns = NULL; 5444 iocb->nsid = 0; 5445 iocb->broadcast = (nsid == NVME_NSID_BROADCAST); 5446 iocb->offset = 0; 5447 5448 if (!iocb->broadcast) { 5449 if (!nvme_nsid_valid(n, nsid)) { 5450 status = NVME_INVALID_NSID | NVME_DNR; 5451 goto out; 5452 } 5453 5454 iocb->ns = nvme_ns(n, nsid); 5455 if (!iocb->ns) { 5456 status = NVME_INVALID_FIELD | NVME_DNR; 5457 goto out; 5458 } 5459 } 5460 5461 req->aiocb = &iocb->common; 5462 qemu_bh_schedule(iocb->bh); 5463 5464 return NVME_NO_COMPLETE; 5465 5466 out: 5467 qemu_bh_delete(iocb->bh); 5468 iocb->bh = NULL; 5469 qemu_aio_unref(iocb); 5470 return status; 5471 } 5472 5473 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) 5474 { 5475 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, 5476 nvme_adm_opc_str(req->cmd.opcode)); 5477 5478 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { 5479 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); 5480 return NVME_INVALID_OPCODE | NVME_DNR; 5481 } 5482 5483 /* SGLs shall not be used for Admin commands in NVMe over PCIe */ 5484 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) { 5485 return NVME_INVALID_FIELD | NVME_DNR; 5486 } 5487 5488 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) { 5489 return NVME_INVALID_FIELD; 5490 } 5491 5492 switch (req->cmd.opcode) { 5493 case NVME_ADM_CMD_DELETE_SQ: 5494 return nvme_del_sq(n, req); 5495 case NVME_ADM_CMD_CREATE_SQ: 5496 return nvme_create_sq(n, req); 5497 case NVME_ADM_CMD_GET_LOG_PAGE: 5498 return nvme_get_log(n, req); 5499 
case NVME_ADM_CMD_DELETE_CQ: 5500 return nvme_del_cq(n, req); 5501 case NVME_ADM_CMD_CREATE_CQ: 5502 return nvme_create_cq(n, req); 5503 case NVME_ADM_CMD_IDENTIFY: 5504 return nvme_identify(n, req); 5505 case NVME_ADM_CMD_ABORT: 5506 return nvme_abort(n, req); 5507 case NVME_ADM_CMD_SET_FEATURES: 5508 return nvme_set_feature(n, req); 5509 case NVME_ADM_CMD_GET_FEATURES: 5510 return nvme_get_feature(n, req); 5511 case NVME_ADM_CMD_ASYNC_EV_REQ: 5512 return nvme_aer(n, req); 5513 case NVME_ADM_CMD_NS_ATTACHMENT: 5514 return nvme_ns_attachment(n, req); 5515 case NVME_ADM_CMD_FORMAT_NVM: 5516 return nvme_format(n, req); 5517 default: 5518 assert(false); 5519 } 5520 5521 return NVME_INVALID_OPCODE | NVME_DNR; 5522 } 5523 5524 static void nvme_process_sq(void *opaque) 5525 { 5526 NvmeSQueue *sq = opaque; 5527 NvmeCtrl *n = sq->ctrl; 5528 NvmeCQueue *cq = n->cq[sq->cqid]; 5529 5530 uint16_t status; 5531 hwaddr addr; 5532 NvmeCmd cmd; 5533 NvmeRequest *req; 5534 5535 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) { 5536 addr = sq->dma_addr + sq->head * n->sqe_size; 5537 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) { 5538 trace_pci_nvme_err_addr_read(addr); 5539 trace_pci_nvme_err_cfs(); 5540 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); 5541 break; 5542 } 5543 nvme_inc_sq_head(sq); 5544 5545 req = QTAILQ_FIRST(&sq->req_list); 5546 QTAILQ_REMOVE(&sq->req_list, req, entry); 5547 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); 5548 nvme_req_clear(req); 5549 req->cqe.cid = cmd.cid; 5550 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd)); 5551 5552 status = sq->sqid ? nvme_io_cmd(n, req) : 5553 nvme_admin_cmd(n, req); 5554 if (status != NVME_NO_COMPLETE) { 5555 req->status = status; 5556 nvme_enqueue_req_completion(cq, req); 5557 } 5558 } 5559 } 5560 5561 static void nvme_ctrl_reset(NvmeCtrl *n) 5562 { 5563 NvmeNamespace *ns; 5564 int i; 5565 5566 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5567 ns = nvme_ns(n, i); 5568 if (!ns) { 5569 continue; 5570 } 5571 5572 nvme_ns_drain(ns); 5573 } 5574 5575 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 5576 if (n->sq[i] != NULL) { 5577 nvme_free_sq(n->sq[i], n); 5578 } 5579 } 5580 for (i = 0; i < n->params.max_ioqpairs + 1; i++) { 5581 if (n->cq[i] != NULL) { 5582 nvme_free_cq(n->cq[i], n); 5583 } 5584 } 5585 5586 while (!QTAILQ_EMPTY(&n->aer_queue)) { 5587 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue); 5588 QTAILQ_REMOVE(&n->aer_queue, event, entry); 5589 g_free(event); 5590 } 5591 5592 n->aer_queued = 0; 5593 n->outstanding_aers = 0; 5594 n->qs_created = false; 5595 } 5596 5597 static void nvme_ctrl_shutdown(NvmeCtrl *n) 5598 { 5599 NvmeNamespace *ns; 5600 int i; 5601 5602 if (n->pmr.dev) { 5603 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 5604 } 5605 5606 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5607 ns = nvme_ns(n, i); 5608 if (!ns) { 5609 continue; 5610 } 5611 5612 nvme_ns_shutdown(ns); 5613 } 5614 } 5615 5616 static void nvme_select_iocs(NvmeCtrl *n) 5617 { 5618 NvmeNamespace *ns; 5619 int i; 5620 5621 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 5622 ns = nvme_ns(n, i); 5623 if (!ns) { 5624 continue; 5625 } 5626 5627 nvme_select_iocs_ns(n, ns); 5628 } 5629 } 5630 5631 static int nvme_start_ctrl(NvmeCtrl *n) 5632 { 5633 uint64_t cap = ldq_le_p(&n->bar.cap); 5634 uint32_t cc = ldl_le_p(&n->bar.cc); 5635 uint32_t aqa = ldl_le_p(&n->bar.aqa); 5636 uint64_t asq = ldq_le_p(&n->bar.asq); 5637 uint64_t acq = ldq_le_p(&n->bar.acq); 5638 uint32_t page_bits = NVME_CC_MPS(cc) + 12; 5639 uint32_t page_size = 1 << page_bits; 
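    /*
     * CC.MPS encodes the memory page size as a power of two in units of
     * 4 KiB: page_size = 2^(12 + MPS), e.g. MPS = 0 gives 4 KiB pages and
     * MPS = 4 gives 64 KiB pages. The value is validated against CAP.MPSMIN
     * and CAP.MPSMAX below.
     */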
5640 5641 if (unlikely(n->cq[0])) { 5642 trace_pci_nvme_err_startfail_cq(); 5643 return -1; 5644 } 5645 if (unlikely(n->sq[0])) { 5646 trace_pci_nvme_err_startfail_sq(); 5647 return -1; 5648 } 5649 if (unlikely(asq & (page_size - 1))) { 5650 trace_pci_nvme_err_startfail_asq_misaligned(asq); 5651 return -1; 5652 } 5653 if (unlikely(acq & (page_size - 1))) { 5654 trace_pci_nvme_err_startfail_acq_misaligned(acq); 5655 return -1; 5656 } 5657 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) { 5658 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc)); 5659 return -1; 5660 } 5661 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) { 5662 trace_pci_nvme_err_startfail_page_too_small( 5663 NVME_CC_MPS(cc), 5664 NVME_CAP_MPSMIN(cap)); 5665 return -1; 5666 } 5667 if (unlikely(NVME_CC_MPS(cc) > 5668 NVME_CAP_MPSMAX(cap))) { 5669 trace_pci_nvme_err_startfail_page_too_large( 5670 NVME_CC_MPS(cc), 5671 NVME_CAP_MPSMAX(cap)); 5672 return -1; 5673 } 5674 if (unlikely(NVME_CC_IOCQES(cc) < 5675 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) { 5676 trace_pci_nvme_err_startfail_cqent_too_small( 5677 NVME_CC_IOCQES(cc), 5678 NVME_CTRL_CQES_MIN(cap)); 5679 return -1; 5680 } 5681 if (unlikely(NVME_CC_IOCQES(cc) > 5682 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) { 5683 trace_pci_nvme_err_startfail_cqent_too_large( 5684 NVME_CC_IOCQES(cc), 5685 NVME_CTRL_CQES_MAX(cap)); 5686 return -1; 5687 } 5688 if (unlikely(NVME_CC_IOSQES(cc) < 5689 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) { 5690 trace_pci_nvme_err_startfail_sqent_too_small( 5691 NVME_CC_IOSQES(cc), 5692 NVME_CTRL_SQES_MIN(cap)); 5693 return -1; 5694 } 5695 if (unlikely(NVME_CC_IOSQES(cc) > 5696 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) { 5697 trace_pci_nvme_err_startfail_sqent_too_large( 5698 NVME_CC_IOSQES(cc), 5699 NVME_CTRL_SQES_MAX(cap)); 5700 return -1; 5701 } 5702 if (unlikely(!NVME_AQA_ASQS(aqa))) { 5703 trace_pci_nvme_err_startfail_asqent_sz_zero(); 5704 return -1; 5705 } 5706 if (unlikely(!NVME_AQA_ACQS(aqa))) { 5707 trace_pci_nvme_err_startfail_acqent_sz_zero(); 5708 return -1; 5709 } 5710 5711 n->page_bits = page_bits; 5712 n->page_size = page_size; 5713 n->max_prp_ents = n->page_size / sizeof(uint64_t); 5714 n->cqe_size = 1 << NVME_CC_IOCQES(cc); 5715 n->sqe_size = 1 << NVME_CC_IOSQES(cc); 5716 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1); 5717 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1); 5718 5719 nvme_set_timestamp(n, 0ULL); 5720 5721 QTAILQ_INIT(&n->aer_queue); 5722 5723 nvme_select_iocs(n); 5724 5725 return 0; 5726 } 5727 5728 static void nvme_cmb_enable_regs(NvmeCtrl *n) 5729 { 5730 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc); 5731 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz); 5732 5733 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1); 5734 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1); 5735 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR); 5736 stl_le_p(&n->bar.cmbloc, cmbloc); 5737 5738 NVME_CMBSZ_SET_SQS(cmbsz, 1); 5739 NVME_CMBSZ_SET_CQS(cmbsz, 0); 5740 NVME_CMBSZ_SET_LISTS(cmbsz, 1); 5741 NVME_CMBSZ_SET_RDS(cmbsz, 1); 5742 NVME_CMBSZ_SET_WDS(cmbsz, 1); 5743 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */ 5744 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb); 5745 stl_le_p(&n->bar.cmbsz, cmbsz); 5746 } 5747 5748 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, 5749 unsigned size) 5750 { 5751 uint64_t cap = ldq_le_p(&n->bar.cap); 5752 uint32_t cc = ldl_le_p(&n->bar.cc); 5753 uint32_t intms = ldl_le_p(&n->bar.intms); 5754 uint32_t csts = ldl_le_p(&n->bar.csts); 5755 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts); 5756 5757 if 
(unlikely(offset & (sizeof(uint32_t) - 1))) { 5758 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32, 5759 "MMIO write not 32-bit aligned," 5760 " offset=0x%"PRIx64"", offset); 5761 /* should be ignored, fall through for now */ 5762 } 5763 5764 if (unlikely(size < sizeof(uint32_t))) { 5765 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall, 5766 "MMIO write smaller than 32-bits," 5767 " offset=0x%"PRIx64", size=%u", 5768 offset, size); 5769 /* should be ignored, fall through for now */ 5770 } 5771 5772 switch (offset) { 5773 case NVME_REG_INTMS: 5774 if (unlikely(msix_enabled(&(n->parent_obj)))) { 5775 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 5776 "undefined access to interrupt mask set" 5777 " when MSI-X is enabled"); 5778 /* should be ignored, fall through for now */ 5779 } 5780 intms |= data; 5781 stl_le_p(&n->bar.intms, intms); 5782 n->bar.intmc = n->bar.intms; 5783 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms); 5784 nvme_irq_check(n); 5785 break; 5786 case NVME_REG_INTMC: 5787 if (unlikely(msix_enabled(&(n->parent_obj)))) { 5788 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix, 5789 "undefined access to interrupt mask clr" 5790 " when MSI-X is enabled"); 5791 /* should be ignored, fall through for now */ 5792 } 5793 intms &= ~data; 5794 stl_le_p(&n->bar.intms, intms); 5795 n->bar.intmc = n->bar.intms; 5796 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms); 5797 nvme_irq_check(n); 5798 break; 5799 case NVME_REG_CC: 5800 trace_pci_nvme_mmio_cfg(data & 0xffffffff); 5801 5802 /* Windows first sends data, then sends enable bit */ 5803 if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) && 5804 !NVME_CC_SHN(data) && !NVME_CC_SHN(cc)) 5805 { 5806 cc = data; 5807 } 5808 5809 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) { 5810 cc = data; 5811 5812 /* flush CC since nvme_start_ctrl() needs the value */ 5813 stl_le_p(&n->bar.cc, cc); 5814 if (unlikely(nvme_start_ctrl(n))) { 5815 trace_pci_nvme_err_startfail(); 5816 csts = NVME_CSTS_FAILED; 5817 } else { 5818 trace_pci_nvme_mmio_start_success(); 5819 csts = NVME_CSTS_READY; 5820 } 5821 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) { 5822 trace_pci_nvme_mmio_stopped(); 5823 nvme_ctrl_reset(n); 5824 cc = 0; 5825 csts &= ~NVME_CSTS_READY; 5826 } 5827 5828 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) { 5829 trace_pci_nvme_mmio_shutdown_set(); 5830 nvme_ctrl_shutdown(n); 5831 cc = data; 5832 csts |= NVME_CSTS_SHST_COMPLETE; 5833 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) { 5834 trace_pci_nvme_mmio_shutdown_cleared(); 5835 csts &= ~NVME_CSTS_SHST_COMPLETE; 5836 cc = data; 5837 } 5838 5839 stl_le_p(&n->bar.cc, cc); 5840 stl_le_p(&n->bar.csts, csts); 5841 5842 break; 5843 case NVME_REG_CSTS: 5844 if (data & (1 << 4)) { 5845 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported, 5846 "attempted to W1C CSTS.NSSRO" 5847 " but CAP.NSSRS is zero (not supported)"); 5848 } else if (data != 0) { 5849 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts, 5850 "attempted to set a read only bit" 5851 " of controller status"); 5852 } 5853 break; 5854 case NVME_REG_NSSR: 5855 if (data == 0x4e564d65) { 5856 trace_pci_nvme_ub_mmiowr_ssreset_unsupported(); 5857 } else { 5858 /* The spec says that writes of other values have no effect */ 5859 return; 5860 } 5861 break; 5862 case NVME_REG_AQA: 5863 stl_le_p(&n->bar.aqa, data); 5864 trace_pci_nvme_mmio_aqattr(data & 0xffffffff); 5865 break; 5866 case NVME_REG_ASQ: 5867 stn_le_p(&n->bar.asq, size, data); 5868 trace_pci_nvme_mmio_asqaddr(data); 5869 break; 5870 case NVME_REG_ASQ + 4: 5871 stl_le_p((uint8_t *)&n->bar.asq 
+ 4, data); 5872 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq)); 5873 break; 5874 case NVME_REG_ACQ: 5875 trace_pci_nvme_mmio_acqaddr(data); 5876 stn_le_p(&n->bar.acq, size, data); 5877 break; 5878 case NVME_REG_ACQ + 4: 5879 stl_le_p((uint8_t *)&n->bar.acq + 4, data); 5880 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq)); 5881 break; 5882 case NVME_REG_CMBLOC: 5883 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved, 5884 "invalid write to reserved CMBLOC" 5885 " when CMBSZ is zero, ignored"); 5886 return; 5887 case NVME_REG_CMBSZ: 5888 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly, 5889 "invalid write to read only CMBSZ, ignored"); 5890 return; 5891 case NVME_REG_CMBMSC: 5892 if (!NVME_CAP_CMBS(cap)) { 5893 return; 5894 } 5895 5896 stn_le_p(&n->bar.cmbmsc, size, data); 5897 n->cmb.cmse = false; 5898 5899 if (NVME_CMBMSC_CRE(data)) { 5900 nvme_cmb_enable_regs(n); 5901 5902 if (NVME_CMBMSC_CMSE(data)) { 5903 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc); 5904 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT; 5905 if (cba + int128_get64(n->cmb.mem.size) < cba) { 5906 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts); 5907 NVME_CMBSTS_SET_CBAI(cmbsts, 1); 5908 stl_le_p(&n->bar.cmbsts, cmbsts); 5909 return; 5910 } 5911 5912 n->cmb.cba = cba; 5913 n->cmb.cmse = true; 5914 } 5915 } else { 5916 n->bar.cmbsz = 0; 5917 n->bar.cmbloc = 0; 5918 } 5919 5920 return; 5921 case NVME_REG_CMBMSC + 4: 5922 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data); 5923 return; 5924 5925 case NVME_REG_PMRCAP: 5926 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, 5927 "invalid write to PMRCAP register, ignored"); 5928 return; 5929 case NVME_REG_PMRCTL: 5930 if (!NVME_CAP_PMRS(cap)) { 5931 return; 5932 } 5933 5934 stl_le_p(&n->bar.pmrctl, data); 5935 if (NVME_PMRCTL_EN(data)) { 5936 memory_region_set_enabled(&n->pmr.dev->mr, true); 5937 pmrsts = 0; 5938 } else { 5939 memory_region_set_enabled(&n->pmr.dev->mr, false); 5940 NVME_PMRSTS_SET_NRDY(pmrsts, 1); 5941 n->pmr.cmse = false; 5942 } 5943 stl_le_p(&n->bar.pmrsts, pmrsts); 5944 return; 5945 case NVME_REG_PMRSTS: 5946 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, 5947 "invalid write to PMRSTS register, ignored"); 5948 return; 5949 case NVME_REG_PMREBS: 5950 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly, 5951 "invalid write to PMREBS register, ignored"); 5952 return; 5953 case NVME_REG_PMRSWTP: 5954 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly, 5955 "invalid write to PMRSWTP register, ignored"); 5956 return; 5957 case NVME_REG_PMRMSCL: 5958 if (!NVME_CAP_PMRS(cap)) { 5959 return; 5960 } 5961 5962 stl_le_p(&n->bar.pmrmscl, data); 5963 n->pmr.cmse = false; 5964 5965 if (NVME_PMRMSCL_CMSE(data)) { 5966 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu); 5967 hwaddr cba = pmrmscu << 32 | 5968 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT); 5969 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) { 5970 NVME_PMRSTS_SET_CBAI(pmrsts, 1); 5971 stl_le_p(&n->bar.pmrsts, pmrsts); 5972 return; 5973 } 5974 5975 n->pmr.cmse = true; 5976 n->pmr.cba = cba; 5977 } 5978 5979 return; 5980 case NVME_REG_PMRMSCU: 5981 if (!NVME_CAP_PMRS(cap)) { 5982 return; 5983 } 5984 5985 stl_le_p(&n->bar.pmrmscu, data); 5986 return; 5987 default: 5988 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, 5989 "invalid MMIO write," 5990 " offset=0x%"PRIx64", data=%"PRIx64"", 5991 offset, data); 5992 break; 5993 } 5994 } 5995 5996 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) 5997 { 5998 NvmeCtrl *n = (NvmeCtrl *)opaque; 5999 uint8_t *ptr = (uint8_t *)&n->bar; 
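    /*
     * Register reads are serviced from the little-endian shadow copy of the
     * register file in n->bar; only a read of PMRSTS may have a side effect
     * (an msync of the PMR backing memory, handled below).
     */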
6000 6001 trace_pci_nvme_mmio_read(addr, size); 6002 6003 if (unlikely(addr & (sizeof(uint32_t) - 1))) { 6004 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, 6005 "MMIO read not 32-bit aligned," 6006 " offset=0x%"PRIx64"", addr); 6007 /* should RAZ, fall through for now */ 6008 } else if (unlikely(size < sizeof(uint32_t))) { 6009 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall, 6010 "MMIO read smaller than 32-bits," 6011 " offset=0x%"PRIx64"", addr); 6012 /* should RAZ, fall through for now */ 6013 } 6014 6015 if (addr > sizeof(n->bar) - size) { 6016 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs, 6017 "MMIO read beyond last register," 6018 " offset=0x%"PRIx64", returning 0", addr); 6019 6020 return 0; 6021 } 6022 6023 /* 6024 * When PMRWBM bit 1 is set, a read from 6025 * PMRSTS should ensure that prior writes 6026 * have made it to persistent media 6027 */ 6028 if (addr == NVME_REG_PMRSTS && 6029 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) { 6030 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); 6031 } 6032 6033 return ldn_le_p(ptr + addr, size); 6034 } 6035 6036 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) 6037 { 6038 uint32_t qid; 6039 6040 if (unlikely(addr & ((1 << 2) - 1))) { 6041 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, 6042 "doorbell write not 32-bit aligned," 6043 " offset=0x%"PRIx64", ignoring", addr); 6044 return; 6045 } 6046 6047 if (((addr - 0x1000) >> 2) & 1) { 6048 /* Completion queue doorbell write */ 6049 6050 uint16_t new_head = val & 0xffff; 6051 int start_sqs; 6052 NvmeCQueue *cq; 6053 6054 qid = (addr - (0x1000 + (1 << 2))) >> 3; 6055 if (unlikely(nvme_check_cqid(n, qid))) { 6056 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq, 6057 "completion queue doorbell write" 6058 " for nonexistent queue," 6059 " sqid=%"PRIu32", ignoring", qid); 6060 6061 /* 6062 * NVM Express v1.3d, Section 4.1 states: "If host software writes 6063 * an invalid value to the Submission Queue Tail Doorbell or 6064 * Completion Queue Head Doorbell register and an Asynchronous Event 6065 * Request command is outstanding, then an asynchronous event is 6066 * posted to the Admin Completion Queue with a status code of 6067 * Invalid Doorbell Write Value." 6068 * 6069 * Also note that the spec includes the "Invalid Doorbell Register" 6070 * status code, but nowhere does it specify when to use it. 6071 * However, it seems reasonable to use it here in a similar 6072 * fashion. 6073 */ 6074 if (n->outstanding_aers) { 6075 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 6076 NVME_AER_INFO_ERR_INVALID_DB_REGISTER, 6077 NVME_LOG_ERROR_INFO); 6078 } 6079 6080 return; 6081 } 6082 6083 cq = n->cq[qid]; 6084 if (unlikely(new_head >= cq->size)) { 6085 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead, 6086 "completion queue doorbell write value" 6087 " beyond queue size, sqid=%"PRIu32"," 6088 " new_head=%"PRIu16", ignoring", 6089 qid, new_head); 6090 6091 if (n->outstanding_aers) { 6092 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 6093 NVME_AER_INFO_ERR_INVALID_DB_VALUE, 6094 NVME_LOG_ERROR_INFO); 6095 } 6096 6097 return; 6098 } 6099 6100 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); 6101 6102 start_sqs = nvme_cq_full(cq) ?
1 : 0; 6103 cq->head = new_head; 6104 if (start_sqs) { 6105 NvmeSQueue *sq; 6106 QTAILQ_FOREACH(sq, &cq->sq_list, entry) { 6107 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 6108 } 6109 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 6110 } 6111 6112 if (cq->tail == cq->head) { 6113 if (cq->irq_enabled) { 6114 n->cq_pending--; 6115 } 6116 6117 nvme_irq_deassert(n, cq); 6118 } 6119 } else { 6120 /* Submission queue doorbell write */ 6121 6122 uint16_t new_tail = val & 0xffff; 6123 NvmeSQueue *sq; 6124 6125 qid = (addr - 0x1000) >> 3; 6126 if (unlikely(nvme_check_sqid(n, qid))) { 6127 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq, 6128 "submission queue doorbell write" 6129 " for nonexistent queue," 6130 " sqid=%"PRIu32", ignoring", qid); 6131 6132 if (n->outstanding_aers) { 6133 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 6134 NVME_AER_INFO_ERR_INVALID_DB_REGISTER, 6135 NVME_LOG_ERROR_INFO); 6136 } 6137 6138 return; 6139 } 6140 6141 sq = n->sq[qid]; 6142 if (unlikely(new_tail >= sq->size)) { 6143 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail, 6144 "submission queue doorbell write value" 6145 " beyond queue size, sqid=%"PRIu32"," 6146 " new_tail=%"PRIu16", ignoring", 6147 qid, new_tail); 6148 6149 if (n->outstanding_aers) { 6150 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, 6151 NVME_AER_INFO_ERR_INVALID_DB_VALUE, 6152 NVME_LOG_ERROR_INFO); 6153 } 6154 6155 return; 6156 } 6157 6158 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail); 6159 6160 sq->tail = new_tail; 6161 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); 6162 } 6163 } 6164 6165 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, 6166 unsigned size) 6167 { 6168 NvmeCtrl *n = (NvmeCtrl *)opaque; 6169 6170 trace_pci_nvme_mmio_write(addr, data, size); 6171 6172 if (addr < sizeof(n->bar)) { 6173 nvme_write_bar(n, addr, data, size); 6174 } else { 6175 nvme_process_db(n, addr, data); 6176 } 6177 } 6178 6179 static const MemoryRegionOps nvme_mmio_ops = { 6180 .read = nvme_mmio_read, 6181 .write = nvme_mmio_write, 6182 .endianness = DEVICE_LITTLE_ENDIAN, 6183 .impl = { 6184 .min_access_size = 2, 6185 .max_access_size = 8, 6186 }, 6187 }; 6188 6189 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, 6190 unsigned size) 6191 { 6192 NvmeCtrl *n = (NvmeCtrl *)opaque; 6193 stn_le_p(&n->cmb.buf[addr], size, data); 6194 } 6195 6196 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) 6197 { 6198 NvmeCtrl *n = (NvmeCtrl *)opaque; 6199 return ldn_le_p(&n->cmb.buf[addr], size); 6200 } 6201 6202 static const MemoryRegionOps nvme_cmb_ops = { 6203 .read = nvme_cmb_read, 6204 .write = nvme_cmb_write, 6205 .endianness = DEVICE_LITTLE_ENDIAN, 6206 .impl = { 6207 .min_access_size = 1, 6208 .max_access_size = 8, 6209 }, 6210 }; 6211 6212 static void nvme_check_constraints(NvmeCtrl *n, Error **errp) 6213 { 6214 NvmeParams *params = &n->params; 6215 6216 if (params->num_queues) { 6217 warn_report("num_queues is deprecated; please use max_ioqpairs " 6218 "instead"); 6219 6220 params->max_ioqpairs = params->num_queues - 1; 6221 } 6222 6223 if (n->namespace.blkconf.blk && n->subsys) { 6224 error_setg(errp, "subsystem support is unavailable with legacy " 6225 "namespace ('drive' property)"); 6226 return; 6227 } 6228 6229 if (params->max_ioqpairs < 1 || 6230 params->max_ioqpairs > NVME_MAX_IOQPAIRS) { 6231 error_setg(errp, "max_ioqpairs must be between 1 and %d", 6232 NVME_MAX_IOQPAIRS); 6233 return; 6234 } 6235 6236 if (params->msix_qsize < 1 || 6237 
params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) { 6238 error_setg(errp, "msix_qsize must be between 1 and %d", 6239 PCI_MSIX_FLAGS_QSIZE + 1); 6240 return; 6241 } 6242 6243 if (!params->serial) { 6244 error_setg(errp, "serial property not set"); 6245 return; 6246 } 6247 6248 if (n->pmr.dev) { 6249 if (host_memory_backend_is_mapped(n->pmr.dev)) { 6250 error_setg(errp, "can't use already busy memdev: %s", 6251 object_get_canonical_path_component(OBJECT(n->pmr.dev))); 6252 return; 6253 } 6254 6255 if (!is_power_of_2(n->pmr.dev->size)) { 6256 error_setg(errp, "pmr backend size needs to be power of 2 in size"); 6257 return; 6258 } 6259 6260 host_memory_backend_set_mapped(n->pmr.dev, true); 6261 } 6262 6263 if (n->params.zasl > n->params.mdts) { 6264 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less " 6265 "than or equal to mdts (Maximum Data Transfer Size)"); 6266 return; 6267 } 6268 6269 if (!n->params.vsl) { 6270 error_setg(errp, "vsl must be non-zero"); 6271 return; 6272 } 6273 } 6274 6275 static void nvme_init_state(NvmeCtrl *n) 6276 { 6277 /* add one to max_ioqpairs to account for the admin queue pair */ 6278 n->reg_size = pow2ceil(sizeof(NvmeBar) + 6279 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE); 6280 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); 6281 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1); 6282 n->temperature = NVME_TEMPERATURE; 6283 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING; 6284 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); 6285 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); 6286 } 6287 6288 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) 6289 { 6290 uint64_t cmb_size = n->params.cmb_size_mb * MiB; 6291 uint64_t cap = ldq_le_p(&n->bar.cap); 6292 6293 n->cmb.buf = g_malloc0(cmb_size); 6294 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n, 6295 "nvme-cmb", cmb_size); 6296 pci_register_bar(pci_dev, NVME_CMB_BIR, 6297 PCI_BASE_ADDRESS_SPACE_MEMORY | 6298 PCI_BASE_ADDRESS_MEM_TYPE_64 | 6299 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem); 6300 6301 NVME_CAP_SET_CMBS(cap, 1); 6302 stq_le_p(&n->bar.cap, cap); 6303 6304 if (n->params.legacy_cmb) { 6305 nvme_cmb_enable_regs(n); 6306 n->cmb.cmse = true; 6307 } 6308 } 6309 6310 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) 6311 { 6312 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap); 6313 6314 NVME_PMRCAP_SET_RDS(pmrcap, 1); 6315 NVME_PMRCAP_SET_WDS(pmrcap, 1); 6316 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR); 6317 /* Turn on bit 1 support */ 6318 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02); 6319 NVME_PMRCAP_SET_CMSS(pmrcap, 1); 6320 stl_le_p(&n->bar.pmrcap, pmrcap); 6321 6322 pci_register_bar(pci_dev, NVME_PMR_BIR, 6323 PCI_BASE_ADDRESS_SPACE_MEMORY | 6324 PCI_BASE_ADDRESS_MEM_TYPE_64 | 6325 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr); 6326 6327 memory_region_set_enabled(&n->pmr.dev->mr, false); 6328 } 6329 6330 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) 6331 { 6332 uint8_t *pci_conf = pci_dev->config; 6333 uint64_t bar_size, msix_table_size, msix_pba_size; 6334 unsigned msix_table_offset, msix_pba_offset; 6335 int ret; 6336 6337 Error *err = NULL; 6338 6339 pci_conf[PCI_INTERRUPT_PIN] = 1; 6340 pci_config_set_prog_interface(pci_conf, 0x2); 6341 6342 if (n->params.use_intel_id) { 6343 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL); 6344 pci_config_set_device_id(pci_conf, 0x5845); 6345 } else { 6346 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT); 6347 pci_config_set_device_id(pci_conf, 
PCI_DEVICE_ID_REDHAT_NVME); 6348 } 6349 6350 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); 6351 pcie_endpoint_cap_init(pci_dev, 0x80); 6352 6353 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB); 6354 msix_table_offset = bar_size; 6355 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize; 6356 6357 bar_size += msix_table_size; 6358 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); 6359 msix_pba_offset = bar_size; 6360 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8; 6361 6362 bar_size += msix_pba_size; 6363 bar_size = pow2ceil(bar_size); 6364 6365 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size); 6366 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme", 6367 n->reg_size); 6368 memory_region_add_subregion(&n->bar0, 0, &n->iomem); 6369 6370 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | 6371 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); 6372 ret = msix_init(pci_dev, n->params.msix_qsize, 6373 &n->bar0, 0, msix_table_offset, 6374 &n->bar0, 0, msix_pba_offset, 0, &err); 6375 if (ret < 0) { 6376 if (ret == -ENOTSUP) { 6377 warn_report_err(err); 6378 } else { 6379 error_propagate(errp, err); 6380 return ret; 6381 } 6382 } 6383 6384 if (n->params.cmb_size_mb) { 6385 nvme_init_cmb(n, pci_dev); 6386 } 6387 6388 if (n->pmr.dev) { 6389 nvme_init_pmr(n, pci_dev); 6390 } 6391 6392 return 0; 6393 } 6394 6395 static void nvme_init_subnqn(NvmeCtrl *n) 6396 { 6397 NvmeSubsystem *subsys = n->subsys; 6398 NvmeIdCtrl *id = &n->id_ctrl; 6399 6400 if (!subsys) { 6401 snprintf((char *)id->subnqn, sizeof(id->subnqn), 6402 "nqn.2019-08.org.qemu:%s", n->params.serial); 6403 } else { 6404 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn); 6405 } 6406 } 6407 6408 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) 6409 { 6410 NvmeIdCtrl *id = &n->id_ctrl; 6411 uint8_t *pci_conf = pci_dev->config; 6412 uint64_t cap = ldq_le_p(&n->bar.cap); 6413 6414 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); 6415 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); 6416 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' '); 6417 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' '); 6418 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' '); 6419 6420 id->cntlid = cpu_to_le16(n->cntlid); 6421 6422 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR); 6423 6424 id->rab = 6; 6425 6426 if (n->params.use_intel_id) { 6427 id->ieee[0] = 0xb3; 6428 id->ieee[1] = 0x02; 6429 id->ieee[2] = 0x00; 6430 } else { 6431 id->ieee[0] = 0x00; 6432 id->ieee[1] = 0x54; 6433 id->ieee[2] = 0x52; 6434 } 6435 6436 id->mdts = n->params.mdts; 6437 id->ver = cpu_to_le32(NVME_SPEC_VER); 6438 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT); 6439 id->cntrltype = 0x1; 6440 6441 /* 6442 * Because the controller always completes the Abort command immediately, 6443 * there can never be more than one concurrently executing Abort command, 6444 * so this value is never used for anything. Note that there can easily be 6445 * many Abort commands in the queues, but they are not considered 6446 * "executing" until processed by nvme_abort. 6447 * 6448 * The specification recommends a value of 3 for Abort Command Limit (four 6449 * concurrently outstanding Abort commands), so lets use that though it is 6450 * inconsequential. 
6451 */ 6452 id->acl = 3; 6453 id->aerl = n->params.aerl; 6454 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; 6455 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED; 6456 6457 /* recommended default value (~70 C) */ 6458 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); 6459 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL); 6460 6461 id->sqes = (0x6 << 4) | 0x6; 6462 id->cqes = (0x4 << 4) | 0x4; 6463 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES); 6464 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | 6465 NVME_ONCS_FEATURES | NVME_ONCS_DSM | 6466 NVME_ONCS_COMPARE | NVME_ONCS_COPY); 6467 6468 /* 6469 * NOTE: If this device ever supports a command set that does NOT use 0x0 6470 * as a Flush-equivalent operation, support for the broadcast NSID in Flush 6471 * should probably be removed. 6472 * 6473 * See comment in nvme_io_cmd. 6474 */ 6475 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT; 6476 6477 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0); 6478 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | 6479 NVME_CTRL_SGLS_BITBUCKET); 6480 6481 nvme_init_subnqn(n); 6482 6483 id->psd[0].mp = cpu_to_le16(0x9c4); 6484 id->psd[0].enlat = cpu_to_le32(0x10); 6485 id->psd[0].exlat = cpu_to_le32(0x4); 6486 6487 if (n->subsys) { 6488 id->cmic |= NVME_CMIC_MULTI_CTRL; 6489 } 6490 6491 NVME_CAP_SET_MQES(cap, 0x7ff); 6492 NVME_CAP_SET_CQR(cap, 1); 6493 NVME_CAP_SET_TO(cap, 0xf); 6494 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM); 6495 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP); 6496 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY); 6497 NVME_CAP_SET_MPSMAX(cap, 4); 6498 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0); 6499 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0); 6500 stq_le_p(&n->bar.cap, cap); 6501 6502 stl_le_p(&n->bar.vs, NVME_SPEC_VER); 6503 n->bar.intmc = n->bar.intms = 0; 6504 } 6505 6506 static int nvme_init_subsys(NvmeCtrl *n, Error **errp) 6507 { 6508 int cntlid; 6509 6510 if (!n->subsys) { 6511 return 0; 6512 } 6513 6514 cntlid = nvme_subsys_register_ctrl(n, errp); 6515 if (cntlid < 0) { 6516 return -1; 6517 } 6518 6519 n->cntlid = cntlid; 6520 6521 return 0; 6522 } 6523 6524 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns) 6525 { 6526 uint32_t nsid = ns->params.nsid; 6527 assert(nsid && nsid <= NVME_MAX_NAMESPACES); 6528 6529 n->namespaces[nsid] = ns; 6530 ns->attached++; 6531 6532 n->dmrsl = MIN_NON_ZERO(n->dmrsl, 6533 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); 6534 } 6535 6536 static void nvme_realize(PCIDevice *pci_dev, Error **errp) 6537 { 6538 NvmeCtrl *n = NVME(pci_dev); 6539 NvmeNamespace *ns; 6540 Error *local_err = NULL; 6541 6542 nvme_check_constraints(n, &local_err); 6543 if (local_err) { 6544 error_propagate(errp, local_err); 6545 return; 6546 } 6547 6548 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, 6549 &pci_dev->qdev, n->parent_obj.qdev.id); 6550 6551 nvme_init_state(n); 6552 if (nvme_init_pci(n, pci_dev, errp)) { 6553 return; 6554 } 6555 6556 if (nvme_init_subsys(n, errp)) { 6557 error_propagate(errp, local_err); 6558 return; 6559 } 6560 nvme_init_ctrl(n, pci_dev); 6561 6562 /* setup a namespace if the controller drive property was given */ 6563 if (n->namespace.blkconf.blk) { 6564 ns = &n->namespace; 6565 ns->params.nsid = 1; 6566 6567 if (nvme_ns_setup(ns, errp)) { 6568 return; 6569 } 6570 6571 nvme_attach_ns(n, ns); 6572 } 6573 } 6574 6575 static void nvme_exit(PCIDevice *pci_dev) 6576 { 6577 NvmeCtrl *n = NVME(pci_dev); 6578 NvmeNamespace *ns; 6579 int i; 6580 6581 nvme_ctrl_reset(n); 6582 6583 if 
static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeNamespace *ns;
    int i;

    nvme_ctrl_reset(n);

    if (n->subsys) {
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (ns) {
                ns->attached--;
            }
        }

        nvme_subsys_unregister_ctrl(n->subsys, n);
    }

    g_free(n->cq);
    g_free(n->sq);
    g_free(n->aer_reqs);

    if (n->params.cmb_size_mb) {
        g_free(n->cmb.buf);
    }

    if (n->pmr.dev) {
        host_memory_backend_set_mapped(n->pmr.dev, false);
    }
    msix_uninit(pci_dev, &n->bar0, &n->bar0);
    memory_region_del_subregion(&n->bar0, &n->iomem);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
                     NvmeSubsystem *),
    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                     params.auto_transition_zones, true),
    DEFINE_PROP_END_OF_LIST(),
};

static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value = n->smart_critical_warning;

    visit_type_uint8(v, name, &value, errp);
}

static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value, old_value, cap = 0, index, event;

    if (!visit_type_uint8(v, name, &value, errp)) {
        return;
    }

    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
    if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
        cap |= NVME_SMART_PMR_UNRELIABLE;
    }

    if ((value & cap) != value) {
        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
                   value & ~cap);
        return;
    }

    old_value = n->smart_critical_warning;
    n->smart_critical_warning = value;

    /* only inject new bits of smart critical warning */
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        event = 1 << index;
        if (value & ~old_value & event) {
            nvme_smart_event(n, event);
        }
    }
}
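
/*
 * The "smart_critical_warning" property registered in nvme_instance_init()
 * below can be changed at runtime to inject SMART critical warnings, for
 * example from the monitor (assuming a controller created with id=nvme0;
 * 16 is bit 4, i.e. NVME_SMART_FAILED_VOLATILE_MEDIA):
 *
 *   (qemu) qom-set /machine/peripheral/nvme0 smart_critical_warning 16
 */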
"Non-Volatile Memory Express"; 6691 device_class_set_props(dc, nvme_props); 6692 dc->vmsd = &nvme_vmstate; 6693 } 6694 6695 static void nvme_instance_init(Object *obj) 6696 { 6697 NvmeCtrl *n = NVME(obj); 6698 6699 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex, 6700 "bootindex", "/namespace@1,0", 6701 DEVICE(obj)); 6702 6703 object_property_add(obj, "smart_critical_warning", "uint8", 6704 nvme_get_smart_warning, 6705 nvme_set_smart_warning, NULL, NULL); 6706 } 6707 6708 static const TypeInfo nvme_info = { 6709 .name = TYPE_NVME, 6710 .parent = TYPE_PCI_DEVICE, 6711 .instance_size = sizeof(NvmeCtrl), 6712 .instance_init = nvme_instance_init, 6713 .class_init = nvme_class_init, 6714 .interfaces = (InterfaceInfo[]) { 6715 { INTERFACE_PCIE_DEVICE }, 6716 { } 6717 }, 6718 }; 6719 6720 static const TypeInfo nvme_bus_info = { 6721 .name = TYPE_NVME_BUS, 6722 .parent = TYPE_BUS, 6723 .instance_size = sizeof(NvmeBus), 6724 }; 6725 6726 static void nvme_register_types(void) 6727 { 6728 type_register_static(&nvme_info); 6729 type_register_static(&nvme_bus_info); 6730 } 6731 6732 type_init(nvme_register_types) 6733