1 /* 2 * QEMU NVM Express Virtual Namespace 3 * 4 * Copyright (c) 2019 CNEX Labs 5 * Copyright (c) 2020 Samsung Electronics 6 * 7 * Authors: 8 * Klaus Jensen <k.jensen@samsung.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2. See the 11 * COPYING file in the top-level directory. 12 * 13 */ 14 15 #include "qemu/osdep.h" 16 #include "qemu/units.h" 17 #include "qemu/error-report.h" 18 #include "qapi/error.h" 19 #include "sysemu/sysemu.h" 20 #include "sysemu/block-backend.h" 21 22 #include "nvme.h" 23 #include "trace.h" 24 25 #define MIN_DISCARD_GRANULARITY (4 * KiB) 26 #define NVME_DEFAULT_ZONE_SIZE (128 * MiB) 27 28 void nvme_ns_init_format(NvmeNamespace *ns) 29 { 30 NvmeIdNs *id_ns = &ns->id_ns; 31 BlockDriverInfo bdi; 32 int npdg, nlbas, ret; 33 34 ns->lbaf = id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)]; 35 ns->lbasz = 1 << ns->lbaf.ds; 36 37 nlbas = ns->size / (ns->lbasz + ns->lbaf.ms); 38 39 id_ns->nsze = cpu_to_le64(nlbas); 40 41 /* no thin provisioning */ 42 id_ns->ncap = id_ns->nsze; 43 id_ns->nuse = id_ns->ncap; 44 45 ns->moff = (int64_t)nlbas << ns->lbaf.ds; 46 47 npdg = ns->blkconf.discard_granularity / ns->lbasz; 48 49 ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi); 50 if (ret >= 0 && bdi.cluster_size > ns->blkconf.discard_granularity) { 51 npdg = bdi.cluster_size / ns->lbasz; 52 } 53 54 id_ns->npda = id_ns->npdg = npdg - 1; 55 } 56 57 static int nvme_ns_init(NvmeNamespace *ns, Error **errp) 58 { 59 static uint64_t ns_count; 60 NvmeIdNs *id_ns = &ns->id_ns; 61 uint8_t ds; 62 uint16_t ms; 63 int i; 64 65 ns->csi = NVME_CSI_NVM; 66 ns->status = 0x0; 67 68 ns->id_ns.dlfeat = 0x1; 69 70 /* support DULBE and I/O optimization fields */ 71 id_ns->nsfeat |= (0x4 | 0x10); 72 73 if (ns->params.shared) { 74 id_ns->nmic |= NVME_NMIC_NS_SHARED; 75 } 76 77 /* Substitute a missing EUI-64 by an autogenerated one */ 78 ++ns_count; 79 if (!ns->params.eui64 && ns->params.eui64_default) { 80 ns->params.eui64 = ns_count + NVME_EUI64_DEFAULT; 81 } 82 83 /* simple copy */ 84 id_ns->mssrl = cpu_to_le16(ns->params.mssrl); 85 id_ns->mcl = cpu_to_le32(ns->params.mcl); 86 id_ns->msrc = ns->params.msrc; 87 id_ns->eui64 = cpu_to_be64(ns->params.eui64); 88 89 ds = 31 - clz32(ns->blkconf.logical_block_size); 90 ms = ns->params.ms; 91 92 id_ns->mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE; 93 94 if (ms && ns->params.mset) { 95 id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDED; 96 } 97 98 id_ns->dpc = 0x1f; 99 id_ns->dps = ns->params.pi; 100 if (ns->params.pi && ns->params.pil) { 101 id_ns->dps |= NVME_ID_NS_DPS_FIRST_EIGHT; 102 } 103 104 static const NvmeLBAF lbaf[16] = { 105 [0] = { .ds = 9 }, 106 [1] = { .ds = 9, .ms = 8 }, 107 [2] = { .ds = 9, .ms = 16 }, 108 [3] = { .ds = 9, .ms = 64 }, 109 [4] = { .ds = 12 }, 110 [5] = { .ds = 12, .ms = 8 }, 111 [6] = { .ds = 12, .ms = 16 }, 112 [7] = { .ds = 12, .ms = 64 }, 113 }; 114 115 memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf)); 116 id_ns->nlbaf = 7; 117 118 for (i = 0; i <= id_ns->nlbaf; i++) { 119 NvmeLBAF *lbaf = &id_ns->lbaf[i]; 120 if (lbaf->ds == ds) { 121 if (lbaf->ms == ms) { 122 id_ns->flbas |= i; 123 goto lbaf_found; 124 } 125 } 126 } 127 128 /* add non-standard lba format */ 129 id_ns->nlbaf++; 130 id_ns->lbaf[id_ns->nlbaf].ds = ds; 131 id_ns->lbaf[id_ns->nlbaf].ms = ms; 132 id_ns->flbas |= id_ns->nlbaf; 133 134 lbaf_found: 135 nvme_ns_init_format(ns); 136 137 return 0; 138 } 139 140 static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) 141 { 142 bool read_only; 143 144 if (!blkconf_blocksizes(&ns->blkconf, errp)) { 145 return -1; 146 } 147 148 read_only = !blk_supports_write_perm(ns->blkconf.blk); 149 if (!blkconf_apply_backend_options(&ns->blkconf, read_only, false, errp)) { 150 return -1; 151 } 152 153 if (ns->blkconf.discard_granularity == -1) { 154 ns->blkconf.discard_granularity = 155 MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY); 156 } 157 158 ns->size = blk_getlength(ns->blkconf.blk); 159 if (ns->size < 0) { 160 error_setg_errno(errp, -ns->size, "could not get blockdev size"); 161 return -1; 162 } 163 164 return 0; 165 } 166 167 static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) 168 { 169 uint64_t zone_size, zone_cap; 170 171 /* Make sure that the values of ZNS properties are sane */ 172 if (ns->params.zone_size_bs) { 173 zone_size = ns->params.zone_size_bs; 174 } else { 175 zone_size = NVME_DEFAULT_ZONE_SIZE; 176 } 177 if (ns->params.zone_cap_bs) { 178 zone_cap = ns->params.zone_cap_bs; 179 } else { 180 zone_cap = zone_size; 181 } 182 if (zone_cap > zone_size) { 183 error_setg(errp, "zone capacity %"PRIu64"B exceeds " 184 "zone size %"PRIu64"B", zone_cap, zone_size); 185 return -1; 186 } 187 if (zone_size < ns->lbasz) { 188 error_setg(errp, "zone size %"PRIu64"B too small, " 189 "must be at least %zuB", zone_size, ns->lbasz); 190 return -1; 191 } 192 if (zone_cap < ns->lbasz) { 193 error_setg(errp, "zone capacity %"PRIu64"B too small, " 194 "must be at least %zuB", zone_cap, ns->lbasz); 195 return -1; 196 } 197 198 /* 199 * Save the main zone geometry values to avoid 200 * calculating them later again. 201 */ 202 ns->zone_size = zone_size / ns->lbasz; 203 ns->zone_capacity = zone_cap / ns->lbasz; 204 ns->num_zones = le64_to_cpu(ns->id_ns.nsze) / ns->zone_size; 205 206 /* Do a few more sanity checks of ZNS properties */ 207 if (!ns->num_zones) { 208 error_setg(errp, 209 "insufficient drive capacity, must be at least the size " 210 "of one zone (%"PRIu64"B)", zone_size); 211 return -1; 212 } 213 214 return 0; 215 } 216 217 static void nvme_ns_zoned_init_state(NvmeNamespace *ns) 218 { 219 uint64_t start = 0, zone_size = ns->zone_size; 220 uint64_t capacity = ns->num_zones * zone_size; 221 NvmeZone *zone; 222 int i; 223 224 ns->zone_array = g_new0(NvmeZone, ns->num_zones); 225 if (ns->params.zd_extension_size) { 226 ns->zd_extensions = g_malloc0(ns->params.zd_extension_size * 227 ns->num_zones); 228 } 229 230 QTAILQ_INIT(&ns->exp_open_zones); 231 QTAILQ_INIT(&ns->imp_open_zones); 232 QTAILQ_INIT(&ns->closed_zones); 233 QTAILQ_INIT(&ns->full_zones); 234 235 zone = ns->zone_array; 236 for (i = 0; i < ns->num_zones; i++, zone++) { 237 if (start + zone_size > capacity) { 238 zone_size = capacity - start; 239 } 240 zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE; 241 nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); 242 zone->d.za = 0; 243 zone->d.zcap = ns->zone_capacity; 244 zone->d.zslba = start; 245 zone->d.wp = start; 246 zone->w_ptr = start; 247 start += zone_size; 248 } 249 250 ns->zone_size_log2 = 0; 251 if (is_power_of_2(ns->zone_size)) { 252 ns->zone_size_log2 = 63 - clz64(ns->zone_size); 253 } 254 } 255 256 static void nvme_ns_init_zoned(NvmeNamespace *ns) 257 { 258 NvmeIdNsZoned *id_ns_z; 259 int i; 260 261 nvme_ns_zoned_init_state(ns); 262 263 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned)); 264 265 /* MAR/MOR are zeroes-based, FFFFFFFFFh means no limit */ 266 id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1); 267 id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1); 268 id_ns_z->zoc = 0; 269 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00; 270 271 for (i = 0; i <= ns->id_ns.nlbaf; i++) { 272 id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size); 273 id_ns_z->lbafe[i].zdes = 274 ns->params.zd_extension_size >> 6; /* Units of 64B */ 275 } 276 277 ns->csi = NVME_CSI_ZONED; 278 ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); 279 ns->id_ns.ncap = ns->id_ns.nsze; 280 ns->id_ns.nuse = ns->id_ns.ncap; 281 282 /* 283 * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated" 284 * status of logical blocks. Since the spec defines that logical blocks 285 * SHALL be deallocated when then zone is in the Empty or Offline states, 286 * we can only support DULBE if the zone size is a multiple of the 287 * calculated NPDG. 288 */ 289 if (ns->zone_size % (ns->id_ns.npdg + 1)) { 290 warn_report("the zone size (%"PRIu64" blocks) is not a multiple of " 291 "the calculated deallocation granularity (%d blocks); " 292 "DULBE support disabled", 293 ns->zone_size, ns->id_ns.npdg + 1); 294 295 ns->id_ns.nsfeat &= ~0x4; 296 } 297 298 ns->id_ns_zoned = id_ns_z; 299 } 300 301 static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) 302 { 303 uint8_t state; 304 305 zone->w_ptr = zone->d.wp; 306 state = nvme_get_zone_state(zone); 307 if (zone->d.wp != zone->d.zslba || 308 (zone->d.za & NVME_ZA_ZD_EXT_VALID)) { 309 if (state != NVME_ZONE_STATE_CLOSED) { 310 trace_pci_nvme_clear_ns_close(state, zone->d.zslba); 311 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); 312 } 313 nvme_aor_inc_active(ns); 314 QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); 315 } else { 316 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); 317 nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); 318 } 319 } 320 321 /* 322 * Close all the zones that are currently open. 323 */ 324 static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) 325 { 326 NvmeZone *zone, *next; 327 328 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { 329 QTAILQ_REMOVE(&ns->closed_zones, zone, entry); 330 nvme_aor_dec_active(ns); 331 nvme_clear_zone(ns, zone); 332 } 333 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { 334 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 335 nvme_aor_dec_open(ns); 336 nvme_aor_dec_active(ns); 337 nvme_clear_zone(ns, zone); 338 } 339 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { 340 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); 341 nvme_aor_dec_open(ns); 342 nvme_aor_dec_active(ns); 343 nvme_clear_zone(ns, zone); 344 } 345 346 assert(ns->nr_open_zones == 0); 347 } 348 349 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) 350 { 351 if (!ns->blkconf.blk) { 352 error_setg(errp, "block backend not configured"); 353 return -1; 354 } 355 356 if (ns->params.pi && ns->params.ms < 8) { 357 error_setg(errp, "at least 8 bytes of metadata required to enable " 358 "protection information"); 359 return -1; 360 } 361 362 if (ns->params.nsid > NVME_MAX_NAMESPACES) { 363 error_setg(errp, "invalid namespace id (must be between 0 and %d)", 364 NVME_MAX_NAMESPACES); 365 return -1; 366 } 367 368 if (ns->params.zoned) { 369 if (ns->params.max_active_zones) { 370 if (ns->params.max_open_zones > ns->params.max_active_zones) { 371 error_setg(errp, "max_open_zones (%u) exceeds " 372 "max_active_zones (%u)", ns->params.max_open_zones, 373 ns->params.max_active_zones); 374 return -1; 375 } 376 377 if (!ns->params.max_open_zones) { 378 ns->params.max_open_zones = ns->params.max_active_zones; 379 } 380 } 381 382 if (ns->params.zd_extension_size) { 383 if (ns->params.zd_extension_size & 0x3f) { 384 error_setg(errp, "zone descriptor extension size must be a " 385 "multiple of 64B"); 386 return -1; 387 } 388 if ((ns->params.zd_extension_size >> 6) > 0xff) { 389 error_setg(errp, 390 "zone descriptor extension size is too large"); 391 return -1; 392 } 393 } 394 } 395 396 return 0; 397 } 398 399 int nvme_ns_setup(NvmeNamespace *ns, Error **errp) 400 { 401 if (nvme_ns_check_constraints(ns, errp)) { 402 return -1; 403 } 404 405 if (nvme_ns_init_blk(ns, errp)) { 406 return -1; 407 } 408 409 if (nvme_ns_init(ns, errp)) { 410 return -1; 411 } 412 if (ns->params.zoned) { 413 if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) { 414 return -1; 415 } 416 nvme_ns_init_zoned(ns); 417 } 418 419 return 0; 420 } 421 422 void nvme_ns_drain(NvmeNamespace *ns) 423 { 424 blk_drain(ns->blkconf.blk); 425 } 426 427 void nvme_ns_shutdown(NvmeNamespace *ns) 428 { 429 blk_flush(ns->blkconf.blk); 430 if (ns->params.zoned) { 431 nvme_zoned_ns_shutdown(ns); 432 } 433 } 434 435 void nvme_ns_cleanup(NvmeNamespace *ns) 436 { 437 if (ns->params.zoned) { 438 g_free(ns->id_ns_zoned); 439 g_free(ns->zone_array); 440 g_free(ns->zd_extensions); 441 } 442 } 443 444 static void nvme_ns_unrealize(DeviceState *dev) 445 { 446 NvmeNamespace *ns = NVME_NS(dev); 447 448 nvme_ns_drain(ns); 449 nvme_ns_shutdown(ns); 450 nvme_ns_cleanup(ns); 451 } 452 453 static void nvme_ns_realize(DeviceState *dev, Error **errp) 454 { 455 NvmeNamespace *ns = NVME_NS(dev); 456 BusState *s = qdev_get_parent_bus(dev); 457 NvmeCtrl *n = NVME(s->parent); 458 NvmeSubsystem *subsys = n->subsys; 459 uint32_t nsid = ns->params.nsid; 460 int i; 461 462 if (!n->subsys) { 463 if (ns->params.detached) { 464 error_setg(errp, "detached requires that the nvme device is " 465 "linked to an nvme-subsys device"); 466 return; 467 } 468 469 if (ns->params.shared) { 470 error_setg(errp, "shared requires that the nvme device is " 471 "linked to an nvme-subsys device"); 472 return; 473 } 474 } else { 475 /* 476 * If this namespace belongs to a subsystem (through a link on the 477 * controller device), reparent the device. 478 */ 479 if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) { 480 return; 481 } 482 } 483 484 if (nvme_ns_setup(ns, errp)) { 485 return; 486 } 487 488 if (!nsid) { 489 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 490 if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) { 491 continue; 492 } 493 494 nsid = ns->params.nsid = i; 495 break; 496 } 497 498 if (!nsid) { 499 error_setg(errp, "no free namespace id"); 500 return; 501 } 502 } else { 503 if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) { 504 error_setg(errp, "namespace id '%d' already allocated", nsid); 505 return; 506 } 507 } 508 509 if (subsys) { 510 subsys->namespaces[nsid] = ns; 511 512 if (ns->params.detached) { 513 return; 514 } 515 516 if (ns->params.shared) { 517 for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { 518 NvmeCtrl *ctrl = subsys->ctrls[i]; 519 520 if (ctrl) { 521 nvme_attach_ns(ctrl, ns); 522 } 523 } 524 525 return; 526 } 527 } 528 529 nvme_attach_ns(n, ns); 530 } 531 532 static Property nvme_ns_props[] = { 533 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), 534 DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false), 535 DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false), 536 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), 537 DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), 538 DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0), 539 DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0), 540 DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0), 541 DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0), 542 DEFINE_PROP_UINT8("pil", NvmeNamespace, params.pil, 0), 543 DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128), 544 DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128), 545 DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127), 546 DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false), 547 DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs, 548 NVME_DEFAULT_ZONE_SIZE), 549 DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs, 550 0), 551 DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace, 552 params.cross_zone_read, false), 553 DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace, 554 params.max_active_zones, 0), 555 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace, 556 params.max_open_zones, 0), 557 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, 558 params.zd_extension_size, 0), 559 DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default, 560 true), 561 DEFINE_PROP_END_OF_LIST(), 562 }; 563 564 static void nvme_ns_class_init(ObjectClass *oc, void *data) 565 { 566 DeviceClass *dc = DEVICE_CLASS(oc); 567 568 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); 569 570 dc->bus_type = TYPE_NVME_BUS; 571 dc->realize = nvme_ns_realize; 572 dc->unrealize = nvme_ns_unrealize; 573 device_class_set_props(dc, nvme_ns_props); 574 dc->desc = "Virtual NVMe namespace"; 575 } 576 577 static void nvme_ns_instance_init(Object *obj) 578 { 579 NvmeNamespace *ns = NVME_NS(obj); 580 char *bootindex = g_strdup_printf("/namespace@%d,0", ns->params.nsid); 581 582 device_add_bootindex_property(obj, &ns->bootindex, "bootindex", 583 bootindex, DEVICE(obj)); 584 585 g_free(bootindex); 586 } 587 588 static const TypeInfo nvme_ns_info = { 589 .name = TYPE_NVME_NS, 590 .parent = TYPE_DEVICE, 591 .class_init = nvme_ns_class_init, 592 .instance_size = sizeof(NvmeNamespace), 593 .instance_init = nvme_ns_instance_init, 594 }; 595 596 static void nvme_ns_register_types(void) 597 { 598 type_register_static(&nvme_ns_info); 599 } 600 601 type_init(nvme_ns_register_types) 602